{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': 'simple', 'log_file': 'chkpt/en-ja.do03.ado01/train.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': 'wmt23', 'azureml_logging': False, 'seed': 0, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 8, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 8, 'distributed_num_procs': 8, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': 'tcp://localhost:16383', 'distributed_port': 16383, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 8, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 16384, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 100000, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 16384, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 60000, 'stop_time_hours': 0.0, 'clip_norm': 1.0, 'sentence_avg': False, 'update_freq': [4], 'lr': [0.001], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False, 'debug_param_names': False}, 'checkpoint': {'_name': None, 'save_dir': 'chkpt/en-ja.do03.ado01', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 100000, 'save_interval_updates': 1000, 'keep_interval_updates': 10, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 8}, 'generation': {'_name': None, 'beam': 5, 'beam_mt': 0, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'max_len_a_mt': 0.0, 'max_len_b_mt': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'lenpen_mt': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(no_progress_bar=False, log_interval=100, log_format='simple', log_file='chkpt/en-ja.do03.ado01/train.log', aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project='wmt23', azureml_logging=False, seed=0, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=True, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=8, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='label_smoothed_cross_entropy', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='translation', num_workers=0, skip_invalid_size_inputs_valid_test=False, max_tokens=16384, batch_size=None, required_batch_size_multiple=8, required_seq_len_multiple=1, dataset_impl=None, data_buffer_size=10, train_subset='train', valid_subset='valid', combine_valid_subsets=None, ignore_unused_valid_subsets=False, validate_interval=100000, validate_interval_updates=0, validate_after_updates=0, fixed_validation_seed=None, disable_validation=False, max_tokens_valid=16384, batch_size_valid=None, max_valid_steps=None, curriculum=0, gen_subset='test', num_shards=1, shard_id=0, grouped_shuffling=False, update_epoch_batch_itr=False, update_ordered_indices_seed=False, distributed_world_size=8, distributed_num_procs=8, distributed_rank=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, device_id=0, distributed_no_spawn=False, ddp_backend='pytorch_ddp', ddp_comm_hook='none', bucket_cap_mb=25, fix_batches_to_gpus=False, find_unused_parameters=False, gradient_as_bucket_view=False, fast_stat_sync=False, heartbeat_timeout=-1, broadcast_buffers=False, slowmo_momentum=None, slowmo_base_algorithm='localsgd', localsgd_frequency=3, nprocs_per_node=8, pipeline_model_parallel=False, pipeline_balance=None, pipeline_devices=None, pipeline_chunks=0, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_checkpoint='never', zero_sharding='none', no_reshard_after_forward=False, fp32_reduce_scatter=False, cpu_offload=False, use_sharded_state=False, not_fsdp_flatten_parameters=False, arch='transformer_vaswani_wmt_en_de_big', max_epoch=0, max_update=60000, stop_time_hours=0, clip_norm=1.0, sentence_avg=False, update_freq=[4], lr=[0.001], stop_min_lr=-1.0, use_bmuf=False, skip_remainder_batch=False, debug_param_names=False, save_dir='chkpt/en-ja.do03.ado01', restore_file='checkpoint_last.pt', continue_once=None, finetune_from_model=None, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, optimizer_overrides='{}', save_interval=100000, save_interval_updates=1000, keep_interval_updates=10, keep_interval_updates_pattern=-1, keep_last_epochs=-1, keep_best_checkpoints=-1, no_save=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_save_optimizer_state=False, best_checkpoint_metric='loss', maximize_best_checkpoint_metric=False, patience=-1, checkpoint_suffix='', checkpoint_shard_count=1, load_checkpoint_on_all_dp_ranks=False, write_checkpoints_asynchronously=False, store_ema=False, ema_decay=0.9999, ema_start_update=0, ema_seed_model=None, ema_update_freq=1, ema_fp32=False, data='binarized/en-ja/', source_lang=None, target_lang=None, load_alignments=False, left_pad_source=True, left_pad_target=False, upsample_primary=-1, truncate_source=False, num_batch_buckets=0, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_tokenized_bleu=False, eval_bleu_remove_bpe=None, eval_bleu_print_samples=False, label_smoothing=0.1, report_accuracy=False, ignore_prefix_size=0, adam_betas='(0.9, 0.98)', adam_eps=1e-08, weight_decay=0.0, use_old_adam=False, fp16_adam_stats=False, warmup_updates=4000, warmup_init_lr=-1, pad=1, eos=2, unk=3, encoder_ffn_embed_dim=8192, decoder_ffn_embed_dim=8192, dropout=0.3, attention_dropout=0.1, share_decoder_input_output_embed=True, no_seed_provided=False, encoder_embed_dim=1024, encoder_attention_heads=16, encoder_normalize_before=False, decoder_embed_dim=1024, decoder_attention_heads=16, encoder_embed_path=None, encoder_layers=6, encoder_learned_pos=False, decoder_embed_path=None, decoder_layers=6, decoder_normalize_before=False, decoder_learned_pos=False, activation_dropout=0.0, activation_fn='relu', adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, share_all_embeddings=False, merge_src_tgt_embed=False, no_token_positional_embeddings=False, adaptive_input=False, no_cross_attention=False, cross_self_attention=False, decoder_output_dim=1024, decoder_input_dim=1024, no_scale_embedding=False, layernorm_embedding=False, tie_adaptive_weights=False, checkpoint_activations=False, offload_activations=False, encoder_layers_to_keep=None, decoder_layers_to_keep=None, encoder_layerdrop=0, decoder_layerdrop=0, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, _name='transformer_vaswani_wmt_en_de_big'), 'task': {'_name': 'translation', 'data': 'binarized/en-ja/', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'label_smoothed_cross_entropy', 'label_smoothing': 0.1, 'report_accuracy': False, 'ignore_prefix_size': 0, 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9, 0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.001]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.001]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}} TransformerModel( (encoder): TransformerEncoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(16000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerEncoderLayerBase( (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (dropout_module): FairseqDropout() (activation_dropout_module): FairseqDropout() (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) ) (decoder): TransformerDecoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(32000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerDecoderLayerBase( (dropout_module): FairseqDropout() (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (activation_dropout_module): FairseqDropout() (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (encoder_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (encoder_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) (output_projection): Linear(in_features=1024, out_features=32000, bias=False) ) ) task: TranslationTask model: TransformerModel criterion: LabelSmoothedCrossEntropyCriterion num. shared model params: 326,221,824 (num. trained: 326,221,824) num. expert model params: 0 (num. trained: 0) training on 8 devices (GPUs/TPUs) max tokens per device = 16384 and max sentences per device = None begin dry-run validation on "valid" subset Start iterating over samples epoch 001: 101 / 1689 loss=13.134, nll_loss=12.829, ppl=7277.34, wps=451877, ups=1.04, wpb=434909, bsz=16699.4, num_updates=100, lr=2.5e-05, gnorm=2.799, clip=84, loss_scale=4, train_wall=97, gb_free=18.6, wall=114 epoch 001: 201 / 1689 loss=11.753, nll_loss=11.261, ppl=2453.41, wps=458494, ups=1.05, wpb=435232, bsz=16930.2, num_updates=200, lr=5e-05, gnorm=1.988, clip=94, loss_scale=4, train_wall=94, gb_free=19.4, wall=209 epoch 001: 301 / 1689 loss=11.287, nll_loss=10.707, ppl=1671.7, wps=458306, ups=1.05, wpb=434938, bsz=16709.1, num_updates=300, lr=7.5e-05, gnorm=1.664, clip=90, loss_scale=4, train_wall=94, gb_free=19.9, wall=304 epoch 001: 401 / 1689 loss=10.676, nll_loss=9.98, ppl=1009.94, wps=454318, ups=1.05, wpb=432580, bsz=16463.2, num_updates=400, lr=0.0001, gnorm=1.507, clip=93, loss_scale=4, train_wall=94, gb_free=19.5, wall=399 epoch 001: 501 / 1689 loss=10.129, nll_loss=9.324, ppl=640.99, wps=458852, ups=1.06, wpb=433348, bsz=16501.3, num_updates=500, lr=0.000125, gnorm=1.404, clip=97, loss_scale=4, train_wall=94, gb_free=18.6, wall=493 epoch 001: 601 / 1689 loss=9.708, nll_loss=8.822, ppl=452.56, wps=455019, ups=1.05, wpb=433129, bsz=16487, num_updates=600, lr=0.00015, gnorm=1.298, clip=88, loss_scale=8, train_wall=94, gb_free=18.5, wall=589 epoch 001: 701 / 1689 loss=9.372, nll_loss=8.422, ppl=342.88, wps=460976, ups=1.06, wpb=434275, bsz=16441.5, num_updates=700, lr=0.000175, gnorm=1.205, clip=80, loss_scale=8, train_wall=93, gb_free=19, wall=683 epoch 001: 801 / 1689 loss=9.045, nll_loss=8.039, ppl=262.96, wps=456789, ups=1.05, wpb=435021, bsz=16338.7, num_updates=800, lr=0.0002, gnorm=1.055, clip=59, loss_scale=8, train_wall=94, gb_free=18.4, wall=778 epoch 001: 901 / 1689 loss=8.73, nll_loss=7.671, ppl=203.86, wps=456848, ups=1.05, wpb=434892, bsz=16540.2, num_updates=900, lr=0.000225, gnorm=1, clip=48, loss_scale=8, train_wall=94, gb_free=18.4, wall=873 epoch 001: 1001 / 1689 loss=8.427, nll_loss=7.319, ppl=159.69, wps=457028, ups=1.06, wpb=432114, bsz=16729.3, num_updates=1000, lr=0.00025, gnorm=0.927, clip=32, loss_scale=8, train_wall=93, gb_free=20.2, wall=968 begin validation on "valid" subset epoch 001 | valid on 'valid' subset | loss 8.153 | nll_loss 6.958 | ppl 124.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 1000 epoch 001: 1102 / 1689 loss=8.123, nll_loss=6.968, ppl=125.15, wps=379824, ups=0.88, wpb=429834, bsz=16582.3, num_updates=1100, lr=0.000275, gnorm=0.881, clip=23, loss_scale=8, train_wall=94, gb_free=18.4, wall=1081 epoch 001: 1202 / 1689 loss=7.842, nll_loss=6.643, ppl=99.96, wps=457228, ups=1.06, wpb=433198, bsz=16274.6, num_updates=1200, lr=0.0003, gnorm=0.846, clip=10, loss_scale=8, train_wall=93, gb_free=20.4, wall=1176 epoch 001: 1302 / 1689 loss=7.516, nll_loss=6.268, ppl=77.04, wps=457180, ups=1.06, wpb=432602, bsz=16372.2, num_updates=1300, lr=0.000325, gnorm=0.842, clip=22, loss_scale=8, train_wall=93, gb_free=20.6, wall=1270 epoch 001: 1402 / 1689 loss=7.213, nll_loss=5.919, ppl=60.49, wps=455602, ups=1.05, wpb=434746, bsz=16342.9, num_updates=1400, lr=0.00035, gnorm=0.815, clip=15, loss_scale=8, train_wall=93, gb_free=18.9, wall=1366 epoch 001: 1503 / 1689 loss=6.888, nll_loss=5.545, ppl=46.69, wps=454226, ups=1.04, wpb=435224, bsz=16490.6, num_updates=1500, lr=0.000375, gnorm=0.784, clip=11, loss_scale=4, train_wall=94, gb_free=17.8, wall=1462 epoch 001: 1603 / 1689 loss=6.608, nll_loss=5.226, ppl=37.44, wps=457946, ups=1.06, wpb=433888, bsz=16449.4, num_updates=1600, lr=0.0004, gnorm=0.731, clip=10, loss_scale=4, train_wall=93, gb_free=20.4, wall=1556 end of epoch 1 (average epoch stats below) epoch 001 | loss 9.015 | nll_loss 8.024 | ppl 260.28 | wps 451292 | ups 1.04 | wpb 433489 | bsz 16508.7 | num_updates 1686 | lr 0.0004215 | gnorm 1.206 | clip 51.2 | loss_scale 4 | train_wall 1582 | gb_free 21 | wall 1637 Start iterating over samples epoch 002: 14 / 1689 loss=6.38, nll_loss=4.968, ppl=31.29, wps=452726, ups=1.06, wpb=429069, bsz=16337, num_updates=1700, lr=0.000425, gnorm=0.696, clip=7, loss_scale=4, train_wall=93, gb_free=19.7, wall=1651 epoch 002: 14 / 1689 loss=6.38, nll_loss=4.968, ppl=31.29, wps=452726, ups=1.06, wpb=429069, bsz=16337, num_updates=1700, lr=0.000425, gnorm=0.696, clip=7, loss_scale=4, train_wall=93, gb_free=19.7, wall=1651 epoch 002: 114 / 1689 loss=6.182, nll_loss=4.746, ppl=26.83, wps=457784, ups=1.06, wpb=433815, bsz=16765.6, num_updates=1800, lr=0.00045, gnorm=0.636, clip=3, loss_scale=4, train_wall=93, gb_free=19.6, wall=1746 epoch 002: 114 / 1689 loss=6.182, nll_loss=4.746, ppl=26.83, wps=457784, ups=1.06, wpb=433815, bsz=16765.6, num_updates=1800, lr=0.00045, gnorm=0.636, clip=3, loss_scale=4, train_wall=93, gb_free=19.6, wall=1746 epoch 002: 214 / 1689 loss=6.034, nll_loss=4.58, ppl=23.92, wps=453580, ups=1.04, wpb=434234, bsz=16501.8, num_updates=1900, lr=0.000475, gnorm=0.591, clip=1, loss_scale=4, train_wall=94, gb_free=19.5, wall=1842 epoch 002: 214 / 1689 loss=6.034, nll_loss=4.58, ppl=23.92, wps=453580, ups=1.04, wpb=434234, bsz=16501.8, num_updates=1900, lr=0.000475, gnorm=0.591, clip=1, loss_scale=4, train_wall=94, gb_free=19.5, wall=1842 epoch 002: 314 / 1689 loss=5.881, nll_loss=4.411, ppl=21.27, wps=457193, ups=1.05, wpb=434872, bsz=16790, num_updates=2000, lr=0.0005, gnorm=0.56, clip=0, loss_scale=8, train_wall=93, gb_free=18.7, wall=1937 epoch 002: 314 / 1689 loss=5.881, nll_loss=4.411, ppl=21.27, wps=457193, ups=1.05, wpb=434872, bsz=16790, num_updates=2000, lr=0.0005, gnorm=0.56, clip=0, loss_scale=8, train_wall=93, gb_free=18.7, wall=1937 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 5.675 | nll_loss 4.062 | ppl 16.71 | wps 0 | wpb 42662 | bsz 2032 | num_updates 2000 | best_loss 5.675 epoch 002 | valid on 'valid' subset | loss 5.675 | nll_loss 4.062 | ppl 16.71 | wps 0 | wpb 42662 | bsz 2032 | num_updates 2000 | best_loss 5.675 epoch 002: 415 / 1689 loss=5.793, nll_loss=4.314, ppl=19.89, wps=377562, ups=0.87, wpb=432565, bsz=16216.9, num_updates=2100, lr=0.000525, gnorm=0.579, clip=3, loss_scale=4, train_wall=94, gb_free=18.7, wall=2051 epoch 002: 415 / 1689 loss=5.793, nll_loss=4.314, ppl=19.89, wps=377562, ups=0.87, wpb=432565, bsz=16216.9, num_updates=2100, lr=0.000525, gnorm=0.579, clip=3, loss_scale=4, train_wall=94, gb_free=18.7, wall=2051 epoch 002: 515 / 1689 loss=5.684, nll_loss=4.193, ppl=18.29, wps=460341, ups=1.06, wpb=435041, bsz=16516.2, num_updates=2200, lr=0.00055, gnorm=0.505, clip=1, loss_scale=4, train_wall=93, gb_free=19.2, wall=2146 epoch 002: 515 / 1689 loss=5.684, nll_loss=4.193, ppl=18.29, wps=460341, ups=1.06, wpb=435041, bsz=16516.2, num_updates=2200, lr=0.00055, gnorm=0.505, clip=1, loss_scale=4, train_wall=93, gb_free=19.2, wall=2146 epoch 002: 616 / 1689 loss=5.574, nll_loss=4.073, ppl=16.83, wps=454528, ups=1.05, wpb=432524, bsz=16610.3, num_updates=2300, lr=0.000575, gnorm=0.484, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=2241 epoch 002: 616 / 1689 loss=5.574, nll_loss=4.073, ppl=16.83, wps=454528, ups=1.05, wpb=432524, bsz=16610.3, num_updates=2300, lr=0.000575, gnorm=0.484, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=2241 epoch 002: 716 / 1689 loss=5.501, nll_loss=3.992, ppl=15.91, wps=461198, ups=1.06, wpb=434305, bsz=16174.7, num_updates=2400, lr=0.0006, gnorm=0.485, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=2335 epoch 002: 716 / 1689 loss=5.501, nll_loss=3.992, ppl=15.91, wps=461198, ups=1.06, wpb=434305, bsz=16174.7, num_updates=2400, lr=0.0006, gnorm=0.485, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=2335 epoch 002: 816 / 1689 loss=5.419, nll_loss=3.903, ppl=14.96, wps=455478, ups=1.06, wpb=431438, bsz=16804.4, num_updates=2500, lr=0.000625, gnorm=0.469, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=2430 epoch 002: 816 / 1689 loss=5.419, nll_loss=3.903, ppl=14.96, wps=455478, ups=1.06, wpb=431438, bsz=16804.4, num_updates=2500, lr=0.000625, gnorm=0.469, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=2430 epoch 002: 916 / 1689 loss=5.372, nll_loss=3.852, ppl=14.44, wps=458738, ups=1.06, wpb=434226, bsz=16532.2, num_updates=2600, lr=0.00065, gnorm=0.478, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=2524 epoch 002: 916 / 1689 loss=5.372, nll_loss=3.852, ppl=14.44, wps=458738, ups=1.06, wpb=434226, bsz=16532.2, num_updates=2600, lr=0.00065, gnorm=0.478, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=2524 epoch 002: 1016 / 1689 loss=5.304, nll_loss=3.778, ppl=13.72, wps=457146, ups=1.06, wpb=432834, bsz=16389.8, num_updates=2700, lr=0.000675, gnorm=0.473, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=2619 epoch 002: 1016 / 1689 loss=5.304, nll_loss=3.778, ppl=13.72, wps=457146, ups=1.06, wpb=432834, bsz=16389.8, num_updates=2700, lr=0.000675, gnorm=0.473, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=2619 epoch 002: 1116 / 1689 loss=5.262, nll_loss=3.733, ppl=13.29, wps=456193, ups=1.05, wpb=434616, bsz=16381.1, num_updates=2800, lr=0.0007, gnorm=0.429, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=2714 epoch 002: 1116 / 1689 loss=5.262, nll_loss=3.733, ppl=13.29, wps=456193, ups=1.05, wpb=434616, bsz=16381.1, num_updates=2800, lr=0.0007, gnorm=0.429, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=2714 epoch 002: 1216 / 1689 loss=5.207, nll_loss=3.673, ppl=12.76, wps=456615, ups=1.05, wpb=434473, bsz=16578.2, num_updates=2900, lr=0.000725, gnorm=0.453, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=2810 epoch 002: 1216 / 1689 loss=5.207, nll_loss=3.673, ppl=12.76, wps=456615, ups=1.05, wpb=434473, bsz=16578.2, num_updates=2900, lr=0.000725, gnorm=0.453, clip=0, loss_scale=4, train_wall=94, gb_free=18.6, wall=2810 epoch 002: 1316 / 1689 loss=5.172, nll_loss=3.636, ppl=12.43, wps=457066, ups=1.06, wpb=432917, bsz=16800.8, num_updates=3000, lr=0.00075, gnorm=0.45, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=2904 epoch 002: 1316 / 1689 loss=5.172, nll_loss=3.636, ppl=12.43, wps=457066, ups=1.06, wpb=432917, bsz=16800.8, num_updates=3000, lr=0.00075, gnorm=0.45, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=2904 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 5.03 | nll_loss 3.377 | ppl 10.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 3000 | best_loss 5.03 epoch 002 | valid on 'valid' subset | loss 5.03 | nll_loss 3.377 | ppl 10.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 3000 | best_loss 5.03 epoch 002: 1416 / 1689 loss=5.131, nll_loss=3.591, ppl=12.05, wps=383685, ups=0.89, wpb=433486, bsz=16253.3, num_updates=3100, lr=0.000775, gnorm=0.448, clip=0, loss_scale=4, train_wall=92, gb_free=19.2, wall=3017 epoch 002: 1416 / 1689 loss=5.131, nll_loss=3.591, ppl=12.05, wps=383685, ups=0.89, wpb=433486, bsz=16253.3, num_updates=3100, lr=0.000775, gnorm=0.448, clip=0, loss_scale=4, train_wall=92, gb_free=19.2, wall=3017 epoch 002: 1516 / 1689 loss=5.115, nll_loss=3.575, ppl=11.92, wps=462036, ups=1.07, wpb=433644, bsz=16473, num_updates=3200, lr=0.0008, gnorm=0.433, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=3111 epoch 002: 1516 / 1689 loss=5.115, nll_loss=3.575, ppl=11.92, wps=462036, ups=1.07, wpb=433644, bsz=16473, num_updates=3200, lr=0.0008, gnorm=0.433, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=3111 epoch 002: 1617 / 1689 loss=5.066, nll_loss=3.521, ppl=11.48, wps=457626, ups=1.05, wpb=434106, bsz=16336.5, num_updates=3300, lr=0.000825, gnorm=0.437, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=3206 epoch 002: 1617 / 1689 loss=5.066, nll_loss=3.521, ppl=11.48, wps=457626, ups=1.05, wpb=434106, bsz=16336.5, num_updates=3300, lr=0.000825, gnorm=0.437, clip=0, loss_scale=4, train_wall=94, gb_free=18.3, wall=3206 end of epoch 2 (average epoch stats below) epoch 002 | loss 5.47 | nll_loss 3.961 | ppl 15.58 | wps 446648 | ups 1.03 | wpb 433533 | bsz 16502.5 | num_updates 3372 | lr 0.000843 | gnorm 0.493 | clip 0.5 | loss_scale 4 | train_wall 1570 | gb_free 20.3 | wall 3274 epoch 002 | loss 5.47 | nll_loss 3.961 | ppl 15.58 | wps 446648 | ups 1.03 | wpb 433533 | bsz 16502.5 | num_updates 3372 | lr 0.000843 | gnorm 0.493 | clip 0.5 | loss_scale 4 | train_wall 1570 | gb_free 20.3 | wall 3274 Start iterating over samples epoch 003: 28 / 1689 loss=5.053, nll_loss=3.508, ppl=11.37, wps=455927, ups=1.06, wpb=430840, bsz=16401.4, num_updates=3400, lr=0.00085, gnorm=0.441, clip=0, loss_scale=4, train_wall=92, gb_free=17.6, wall=3301 epoch 003: 28 / 1689 loss=5.053, nll_loss=3.508, ppl=11.37, wps=455927, ups=1.06, wpb=430840, bsz=16401.4, num_updates=3400, lr=0.00085, gnorm=0.441, clip=0, loss_scale=4, train_wall=92, gb_free=17.6, wall=3301 epoch 003: 28 / 1689 loss=5.053, nll_loss=3.508, ppl=11.37, wps=455927, ups=1.06, wpb=430840, bsz=16401.4, num_updates=3400, lr=0.00085, gnorm=0.441, clip=0, loss_scale=4, train_wall=92, gb_free=17.6, wall=3301 epoch 003: 128 / 1689 loss=5.012, nll_loss=3.462, ppl=11.02, wps=459962, ups=1.06, wpb=434010, bsz=16790.2, num_updates=3500, lr=0.000875, gnorm=0.426, clip=0, loss_scale=4, train_wall=93, gb_free=17.6, wall=3395 epoch 003: 128 / 1689 loss=5.012, nll_loss=3.462, ppl=11.02, wps=459962, ups=1.06, wpb=434010, bsz=16790.2, num_updates=3500, lr=0.000875, gnorm=0.426, clip=0, loss_scale=4, train_wall=93, gb_free=17.6, wall=3395 epoch 003: 128 / 1689 loss=5.012, nll_loss=3.462, ppl=11.02, wps=459962, ups=1.06, wpb=434010, bsz=16790.2, num_updates=3500, lr=0.000875, gnorm=0.426, clip=0, loss_scale=4, train_wall=93, gb_free=17.6, wall=3395 epoch 003: 228 / 1689 loss=4.994, nll_loss=3.444, ppl=10.88, wps=459669, ups=1.06, wpb=434130, bsz=16909.3, num_updates=3600, lr=0.0009, gnorm=0.44, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=3489 epoch 003: 228 / 1689 loss=4.994, nll_loss=3.444, ppl=10.88, wps=459669, ups=1.06, wpb=434130, bsz=16909.3, num_updates=3600, lr=0.0009, gnorm=0.44, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=3489 epoch 003: 228 / 1689 loss=4.994, nll_loss=3.444, ppl=10.88, wps=459669, ups=1.06, wpb=434130, bsz=16909.3, num_updates=3600, lr=0.0009, gnorm=0.44, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=3489 epoch 003: 328 / 1689 loss=4.969, nll_loss=3.417, ppl=10.68, wps=452411, ups=1.05, wpb=431723, bsz=16540.2, num_updates=3700, lr=0.000925, gnorm=0.437, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=3585 epoch 003: 328 / 1689 loss=4.969, nll_loss=3.417, ppl=10.68, wps=452411, ups=1.05, wpb=431723, bsz=16540.2, num_updates=3700, lr=0.000925, gnorm=0.437, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=3585 epoch 003: 328 / 1689 loss=4.969, nll_loss=3.417, ppl=10.68, wps=452411, ups=1.05, wpb=431723, bsz=16540.2, num_updates=3700, lr=0.000925, gnorm=0.437, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=3585 epoch 003: 428 / 1689 loss=4.968, nll_loss=3.416, ppl=10.68, wps=455500, ups=1.05, wpb=433106, bsz=16454.4, num_updates=3800, lr=0.00095, gnorm=0.436, clip=0, loss_scale=8, train_wall=94, gb_free=20.5, wall=3680 epoch 003: 428 / 1689 loss=4.968, nll_loss=3.416, ppl=10.68, wps=455500, ups=1.05, wpb=433106, bsz=16454.4, num_updates=3800, lr=0.00095, gnorm=0.436, clip=0, loss_scale=8, train_wall=94, gb_free=20.5, wall=3680 epoch 003: 428 / 1689 loss=4.968, nll_loss=3.416, ppl=10.68, wps=455500, ups=1.05, wpb=433106, bsz=16454.4, num_updates=3800, lr=0.00095, gnorm=0.436, clip=0, loss_scale=8, train_wall=94, gb_free=20.5, wall=3680 epoch 003: 529 / 1689 loss=4.954, nll_loss=3.402, ppl=10.57, wps=451409, ups=1.04, wpb=434869, bsz=16527.2, num_updates=3900, lr=0.000975, gnorm=0.419, clip=0, loss_scale=4, train_wall=95, gb_free=18.6, wall=3776 epoch 003: 529 / 1689 loss=4.954, nll_loss=3.402, ppl=10.57, wps=451409, ups=1.04, wpb=434869, bsz=16527.2, num_updates=3900, lr=0.000975, gnorm=0.419, clip=0, loss_scale=4, train_wall=95, gb_free=18.6, wall=3776 epoch 003: 529 / 1689 loss=4.954, nll_loss=3.402, ppl=10.57, wps=451409, ups=1.04, wpb=434869, bsz=16527.2, num_updates=3900, lr=0.000975, gnorm=0.419, clip=0, loss_scale=4, train_wall=95, gb_free=18.6, wall=3776 epoch 003: 629 / 1689 loss=4.938, nll_loss=3.384, ppl=10.44, wps=454285, ups=1.05, wpb=434096, bsz=16447.2, num_updates=4000, lr=0.001, gnorm=0.436, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=3872 epoch 003: 629 / 1689 loss=4.938, nll_loss=3.384, ppl=10.44, wps=454285, ups=1.05, wpb=434096, bsz=16447.2, num_updates=4000, lr=0.001, gnorm=0.436, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=3872 epoch 003: 629 / 1689 loss=4.938, nll_loss=3.384, ppl=10.44, wps=454285, ups=1.05, wpb=434096, bsz=16447.2, num_updates=4000, lr=0.001, gnorm=0.436, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=3872 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.786 | nll_loss 3.123 | ppl 8.71 | wps 0 | wpb 42662 | bsz 2032 | num_updates 4000 | best_loss 4.786 epoch 003 | valid on 'valid' subset | loss 4.786 | nll_loss 3.123 | ppl 8.71 | wps 0 | wpb 42662 | bsz 2032 | num_updates 4000 | best_loss 4.786 epoch 003 | valid on 'valid' subset | loss 4.786 | nll_loss 3.123 | ppl 8.71 | wps 0 | wpb 42662 | bsz 2032 | num_updates 4000 | best_loss 4.786 epoch 003: 730 / 1689 loss=4.937, nll_loss=3.384, ppl=10.44, wps=378352, ups=0.87, wpb=434670, bsz=16366.6, num_updates=4100, lr=0.00098773, gnorm=0.44, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=3987 epoch 003: 730 / 1689 loss=4.937, nll_loss=3.384, ppl=10.44, wps=378352, ups=0.87, wpb=434670, bsz=16366.6, num_updates=4100, lr=0.00098773, gnorm=0.44, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=3987 epoch 003: 730 / 1689 loss=4.937, nll_loss=3.384, ppl=10.44, wps=378352, ups=0.87, wpb=434670, bsz=16366.6, num_updates=4100, lr=0.00098773, gnorm=0.44, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=3987 epoch 003: 830 / 1689 loss=4.906, nll_loss=3.35, ppl=10.2, wps=461599, ups=1.07, wpb=432929, bsz=16190.4, num_updates=4200, lr=0.0009759, gnorm=0.413, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=4080 epoch 003: 830 / 1689 loss=4.906, nll_loss=3.35, ppl=10.2, wps=461599, ups=1.07, wpb=432929, bsz=16190.4, num_updates=4200, lr=0.0009759, gnorm=0.413, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=4080 epoch 003: 830 / 1689 loss=4.906, nll_loss=3.35, ppl=10.2, wps=461599, ups=1.07, wpb=432929, bsz=16190.4, num_updates=4200, lr=0.0009759, gnorm=0.413, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=4080 epoch 003: 930 / 1689 loss=4.887, nll_loss=3.329, ppl=10.05, wps=458636, ups=1.06, wpb=433053, bsz=16397.6, num_updates=4300, lr=0.000964486, gnorm=0.414, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=4175 epoch 003: 930 / 1689 loss=4.887, nll_loss=3.329, ppl=10.05, wps=458636, ups=1.06, wpb=433053, bsz=16397.6, num_updates=4300, lr=0.000964486, gnorm=0.414, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=4175 epoch 003: 930 / 1689 loss=4.887, nll_loss=3.329, ppl=10.05, wps=458636, ups=1.06, wpb=433053, bsz=16397.6, num_updates=4300, lr=0.000964486, gnorm=0.414, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=4175 epoch 003: 1030 / 1689 loss=4.883, nll_loss=3.326, ppl=10.03, wps=460759, ups=1.06, wpb=434331, bsz=16308.9, num_updates=4400, lr=0.000953463, gnorm=0.385, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=4269 epoch 003: 1030 / 1689 loss=4.883, nll_loss=3.326, ppl=10.03, wps=460759, ups=1.06, wpb=434331, bsz=16308.9, num_updates=4400, lr=0.000953463, gnorm=0.385, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=4269 epoch 003: 1030 / 1689 loss=4.883, nll_loss=3.326, ppl=10.03, wps=460759, ups=1.06, wpb=434331, bsz=16308.9, num_updates=4400, lr=0.000953463, gnorm=0.385, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=4269 epoch 003: 1130 / 1689 loss=4.87, nll_loss=3.312, ppl=9.93, wps=459672, ups=1.05, wpb=437633, bsz=16972.6, num_updates=4500, lr=0.000942809, gnorm=0.416, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=4364 epoch 003: 1130 / 1689 loss=4.87, nll_loss=3.312, ppl=9.93, wps=459672, ups=1.05, wpb=437633, bsz=16972.6, num_updates=4500, lr=0.000942809, gnorm=0.416, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=4364 epoch 003: 1130 / 1689 loss=4.87, nll_loss=3.312, ppl=9.93, wps=459672, ups=1.05, wpb=437633, bsz=16972.6, num_updates=4500, lr=0.000942809, gnorm=0.416, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=4364 epoch 003: 1231 / 1689 loss=4.849, nll_loss=3.289, ppl=9.78, wps=455331, ups=1.05, wpb=433553, bsz=16335.3, num_updates=4600, lr=0.000932505, gnorm=0.403, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=4460 epoch 003: 1231 / 1689 loss=4.849, nll_loss=3.289, ppl=9.78, wps=455331, ups=1.05, wpb=433553, bsz=16335.3, num_updates=4600, lr=0.000932505, gnorm=0.403, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=4460 epoch 003: 1231 / 1689 loss=4.849, nll_loss=3.289, ppl=9.78, wps=455331, ups=1.05, wpb=433553, bsz=16335.3, num_updates=4600, lr=0.000932505, gnorm=0.403, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=4460 epoch 003: 1331 / 1689 loss=4.824, nll_loss=3.262, ppl=9.59, wps=460371, ups=1.06, wpb=434179, bsz=16378.6, num_updates=4700, lr=0.000922531, gnorm=0.385, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=4554 epoch 003: 1331 / 1689 loss=4.824, nll_loss=3.262, ppl=9.59, wps=460371, ups=1.06, wpb=434179, bsz=16378.6, num_updates=4700, lr=0.000922531, gnorm=0.385, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=4554 epoch 003: 1331 / 1689 loss=4.824, nll_loss=3.262, ppl=9.59, wps=460371, ups=1.06, wpb=434179, bsz=16378.6, num_updates=4700, lr=0.000922531, gnorm=0.385, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=4554 epoch 003: 1431 / 1689 loss=4.823, nll_loss=3.262, ppl=9.59, wps=458280, ups=1.06, wpb=433420, bsz=16608.6, num_updates=4800, lr=0.000912871, gnorm=0.39, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=4648 epoch 003: 1431 / 1689 loss=4.823, nll_loss=3.262, ppl=9.59, wps=458280, ups=1.06, wpb=433420, bsz=16608.6, num_updates=4800, lr=0.000912871, gnorm=0.39, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=4648 epoch 003: 1431 / 1689 loss=4.823, nll_loss=3.262, ppl=9.59, wps=458280, ups=1.06, wpb=433420, bsz=16608.6, num_updates=4800, lr=0.000912871, gnorm=0.39, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=4648 epoch 003: 1531 / 1689 loss=4.798, nll_loss=3.234, ppl=9.41, wps=456172, ups=1.06, wpb=432104, bsz=16527.4, num_updates=4900, lr=0.000903508, gnorm=0.391, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=4743 epoch 003: 1531 / 1689 loss=4.798, nll_loss=3.234, ppl=9.41, wps=456172, ups=1.06, wpb=432104, bsz=16527.4, num_updates=4900, lr=0.000903508, gnorm=0.391, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=4743 epoch 003: 1531 / 1689 loss=4.798, nll_loss=3.234, ppl=9.41, wps=456172, ups=1.06, wpb=432104, bsz=16527.4, num_updates=4900, lr=0.000903508, gnorm=0.391, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=4743 epoch 003: 1631 / 1689 loss=4.79, nll_loss=3.224, ppl=9.35, wps=456527, ups=1.06, wpb=431565, bsz=16155.2, num_updates=5000, lr=0.000894427, gnorm=0.392, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=4838 epoch 003: 1631 / 1689 loss=4.79, nll_loss=3.224, ppl=9.35, wps=456527, ups=1.06, wpb=431565, bsz=16155.2, num_updates=5000, lr=0.000894427, gnorm=0.392, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=4838 epoch 003: 1631 / 1689 loss=4.79, nll_loss=3.224, ppl=9.35, wps=456527, ups=1.06, wpb=431565, bsz=16155.2, num_updates=5000, lr=0.000894427, gnorm=0.392, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=4838 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.648 | nll_loss 2.999 | ppl 8 | wps 0 | wpb 42662 | bsz 2032 | num_updates 5000 | best_loss 4.648 epoch 003 | valid on 'valid' subset | loss 4.648 | nll_loss 2.999 | ppl 8 | wps 0 | wpb 42662 | bsz 2032 | num_updates 5000 | best_loss 4.648 epoch 003 | valid on 'valid' subset | loss 4.648 | nll_loss 2.999 | ppl 8 | wps 0 | wpb 42662 | bsz 2032 | num_updates 5000 | best_loss 4.648 end of epoch 3 (average epoch stats below) epoch 003 | loss 4.898 | nll_loss 3.341 | ppl 10.14 | wps 439291 | ups 1.01 | wpb 433516 | bsz 16497.9 | num_updates 5058 | lr 0.000889284 | gnorm 0.414 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 19.4 | wall 4937 epoch 003 | loss 4.898 | nll_loss 3.341 | ppl 10.14 | wps 439291 | ups 1.01 | wpb 433516 | bsz 16497.9 | num_updates 5058 | lr 0.000889284 | gnorm 0.414 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 19.4 | wall 4937 epoch 003 | loss 4.898 | nll_loss 3.341 | ppl 10.14 | wps 439291 | ups 1.01 | wpb 433516 | bsz 16497.9 | num_updates 5058 | lr 0.000889284 | gnorm 0.414 | clip 0 | loss_scale 2 | train_wall 1572 | gb_free 19.4 | wall 4937 Start iterating over samples epoch 004: 42 / 1689 loss=4.767, nll_loss=3.2, ppl=9.19, wps=307660, ups=0.71, wpb=430679, bsz=16832.2, num_updates=5100, lr=0.000885615, gnorm=0.388, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=4978 epoch 004: 42 / 1689 loss=4.767, nll_loss=3.2, ppl=9.19, wps=307660, ups=0.71, wpb=430679, bsz=16832.2, num_updates=5100, lr=0.000885615, gnorm=0.388, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=4978 epoch 004: 42 / 1689 loss=4.767, nll_loss=3.2, ppl=9.19, wps=307660, ups=0.71, wpb=430679, bsz=16832.2, num_updates=5100, lr=0.000885615, gnorm=0.388, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=4978 epoch 004: 42 / 1689 loss=4.767, nll_loss=3.2, ppl=9.19, wps=307660, ups=0.71, wpb=430679, bsz=16832.2, num_updates=5100, lr=0.000885615, gnorm=0.388, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=4978 epoch 004: 142 / 1689 loss=4.745, nll_loss=3.175, ppl=9.03, wps=466079, ups=1.08, wpb=433420, bsz=16375.6, num_updates=5200, lr=0.000877058, gnorm=0.364, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=5071 epoch 004: 142 / 1689 loss=4.745, nll_loss=3.175, ppl=9.03, wps=466079, ups=1.08, wpb=433420, bsz=16375.6, num_updates=5200, lr=0.000877058, gnorm=0.364, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=5071 epoch 004: 142 / 1689 loss=4.745, nll_loss=3.175, ppl=9.03, wps=466079, ups=1.08, wpb=433420, bsz=16375.6, num_updates=5200, lr=0.000877058, gnorm=0.364, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=5071 epoch 004: 142 / 1689 loss=4.745, nll_loss=3.175, ppl=9.03, wps=466079, ups=1.08, wpb=433420, bsz=16375.6, num_updates=5200, lr=0.000877058, gnorm=0.364, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=5071 epoch 004: 242 / 1689 loss=4.741, nll_loss=3.17, ppl=9, wps=459254, ups=1.06, wpb=432027, bsz=16282.2, num_updates=5300, lr=0.000868744, gnorm=0.388, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=5165 epoch 004: 242 / 1689 loss=4.741, nll_loss=3.17, ppl=9, wps=459254, ups=1.06, wpb=432027, bsz=16282.2, num_updates=5300, lr=0.000868744, gnorm=0.388, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=5165 epoch 004: 242 / 1689 loss=4.741, nll_loss=3.17, ppl=9, wps=459254, ups=1.06, wpb=432027, bsz=16282.2, num_updates=5300, lr=0.000868744, gnorm=0.388, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=5165 epoch 004: 242 / 1689 loss=4.741, nll_loss=3.17, ppl=9, wps=459254, ups=1.06, wpb=432027, bsz=16282.2, num_updates=5300, lr=0.000868744, gnorm=0.388, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=5165 epoch 004: 343 / 1689 loss=4.738, nll_loss=3.168, ppl=8.99, wps=458400, ups=1.05, wpb=434522, bsz=16721, num_updates=5400, lr=0.000860663, gnorm=0.375, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=5260 epoch 004: 343 / 1689 loss=4.738, nll_loss=3.168, ppl=8.99, wps=458400, ups=1.05, wpb=434522, bsz=16721, num_updates=5400, lr=0.000860663, gnorm=0.375, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=5260 epoch 004: 343 / 1689 loss=4.738, nll_loss=3.168, ppl=8.99, wps=458400, ups=1.05, wpb=434522, bsz=16721, num_updates=5400, lr=0.000860663, gnorm=0.375, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=5260 epoch 004: 343 / 1689 loss=4.738, nll_loss=3.168, ppl=8.99, wps=458400, ups=1.05, wpb=434522, bsz=16721, num_updates=5400, lr=0.000860663, gnorm=0.375, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=5260 epoch 004: 443 / 1689 loss=4.73, nll_loss=3.16, ppl=8.94, wps=461313, ups=1.07, wpb=432435, bsz=16662.2, num_updates=5500, lr=0.000852803, gnorm=0.383, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=5353 epoch 004: 443 / 1689 loss=4.73, nll_loss=3.16, ppl=8.94, wps=461313, ups=1.07, wpb=432435, bsz=16662.2, num_updates=5500, lr=0.000852803, gnorm=0.383, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=5353 epoch 004: 443 / 1689 loss=4.73, nll_loss=3.16, ppl=8.94, wps=461313, ups=1.07, wpb=432435, bsz=16662.2, num_updates=5500, lr=0.000852803, gnorm=0.383, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=5353 epoch 004: 443 / 1689 loss=4.73, nll_loss=3.16, ppl=8.94, wps=461313, ups=1.07, wpb=432435, bsz=16662.2, num_updates=5500, lr=0.000852803, gnorm=0.383, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=5353 epoch 004: 543 / 1689 loss=4.726, nll_loss=3.156, ppl=8.91, wps=464744, ups=1.07, wpb=433622, bsz=16605.2, num_updates=5600, lr=0.000845154, gnorm=0.377, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=5447 epoch 004: 543 / 1689 loss=4.726, nll_loss=3.156, ppl=8.91, wps=464744, ups=1.07, wpb=433622, bsz=16605.2, num_updates=5600, lr=0.000845154, gnorm=0.377, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=5447 epoch 004: 543 / 1689 loss=4.726, nll_loss=3.156, ppl=8.91, wps=464744, ups=1.07, wpb=433622, bsz=16605.2, num_updates=5600, lr=0.000845154, gnorm=0.377, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=5447 epoch 004: 543 / 1689 loss=4.726, nll_loss=3.156, ppl=8.91, wps=464744, ups=1.07, wpb=433622, bsz=16605.2, num_updates=5600, lr=0.000845154, gnorm=0.377, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=5447 epoch 004: 643 / 1689 loss=4.712, nll_loss=3.141, ppl=8.82, wps=458433, ups=1.06, wpb=434504, bsz=16524.6, num_updates=5700, lr=0.000837708, gnorm=0.374, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=5541 epoch 004: 643 / 1689 loss=4.712, nll_loss=3.141, ppl=8.82, wps=458433, ups=1.06, wpb=434504, bsz=16524.6, num_updates=5700, lr=0.000837708, gnorm=0.374, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=5541 epoch 004: 643 / 1689 loss=4.712, nll_loss=3.141, ppl=8.82, wps=458433, ups=1.06, wpb=434504, bsz=16524.6, num_updates=5700, lr=0.000837708, gnorm=0.374, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=5541 epoch 004: 643 / 1689 loss=4.712, nll_loss=3.141, ppl=8.82, wps=458433, ups=1.06, wpb=434504, bsz=16524.6, num_updates=5700, lr=0.000837708, gnorm=0.374, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=5541 epoch 004: 743 / 1689 loss=4.707, nll_loss=3.136, ppl=8.79, wps=460703, ups=1.06, wpb=433836, bsz=16571.8, num_updates=5800, lr=0.000830455, gnorm=0.392, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=5636 epoch 004: 743 / 1689 loss=4.707, nll_loss=3.136, ppl=8.79, wps=460703, ups=1.06, wpb=433836, bsz=16571.8, num_updates=5800, lr=0.000830455, gnorm=0.392, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=5636 epoch 004: 743 / 1689 loss=4.707, nll_loss=3.136, ppl=8.79, wps=460703, ups=1.06, wpb=433836, bsz=16571.8, num_updates=5800, lr=0.000830455, gnorm=0.392, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=5636 epoch 004: 743 / 1689 loss=4.707, nll_loss=3.136, ppl=8.79, wps=460703, ups=1.06, wpb=433836, bsz=16571.8, num_updates=5800, lr=0.000830455, gnorm=0.392, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=5636 epoch 004: 843 / 1689 loss=4.696, nll_loss=3.123, ppl=8.71, wps=461574, ups=1.06, wpb=435037, bsz=16307.4, num_updates=5900, lr=0.000823387, gnorm=0.368, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=5730 epoch 004: 843 / 1689 loss=4.696, nll_loss=3.123, ppl=8.71, wps=461574, ups=1.06, wpb=435037, bsz=16307.4, num_updates=5900, lr=0.000823387, gnorm=0.368, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=5730 epoch 004: 843 / 1689 loss=4.696, nll_loss=3.123, ppl=8.71, wps=461574, ups=1.06, wpb=435037, bsz=16307.4, num_updates=5900, lr=0.000823387, gnorm=0.368, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=5730 epoch 004: 843 / 1689 loss=4.696, nll_loss=3.123, ppl=8.71, wps=461574, ups=1.06, wpb=435037, bsz=16307.4, num_updates=5900, lr=0.000823387, gnorm=0.368, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=5730 epoch 004: 943 / 1689 loss=4.687, nll_loss=3.113, ppl=8.65, wps=458218, ups=1.06, wpb=433850, bsz=16402.2, num_updates=6000, lr=0.000816497, gnorm=0.381, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=5824 epoch 004: 943 / 1689 loss=4.687, nll_loss=3.113, ppl=8.65, wps=458218, ups=1.06, wpb=433850, bsz=16402.2, num_updates=6000, lr=0.000816497, gnorm=0.381, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=5824 epoch 004: 943 / 1689 loss=4.687, nll_loss=3.113, ppl=8.65, wps=458218, ups=1.06, wpb=433850, bsz=16402.2, num_updates=6000, lr=0.000816497, gnorm=0.381, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=5824 epoch 004: 943 / 1689 loss=4.687, nll_loss=3.113, ppl=8.65, wps=458218, ups=1.06, wpb=433850, bsz=16402.2, num_updates=6000, lr=0.000816497, gnorm=0.381, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=5824 begin validation on "valid" subset epoch 004 | valid on 'valid' subset | loss 4.575 | nll_loss 2.922 | ppl 7.58 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.575 epoch 004 | valid on 'valid' subset | loss 4.575 | nll_loss 2.922 | ppl 7.58 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.575 epoch 004 | valid on 'valid' subset | loss 4.575 | nll_loss 2.922 | ppl 7.58 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.575 epoch 004 | valid on 'valid' subset | loss 4.575 | nll_loss 2.922 | ppl 7.58 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.575 epoch 004: 1043 / 1689 loss=4.689, nll_loss=3.117, ppl=8.67, wps=386949, ups=0.89, wpb=433826, bsz=16702.7, num_updates=6100, lr=0.000809776, gnorm=0.37, clip=0, loss_scale=4, train_wall=92, gb_free=19.7, wall=5937 epoch 004: 1043 / 1689 loss=4.689, nll_loss=3.117, ppl=8.67, wps=386949, ups=0.89, wpb=433826, bsz=16702.7, num_updates=6100, lr=0.000809776, gnorm=0.37, clip=0, loss_scale=4, train_wall=92, gb_free=19.7, wall=5937 epoch 004: 1043 / 1689 loss=4.689, nll_loss=3.117, ppl=8.67, wps=386949, ups=0.89, wpb=433826, bsz=16702.7, num_updates=6100, lr=0.000809776, gnorm=0.37, clip=0, loss_scale=4, train_wall=92, gb_free=19.7, wall=5937 epoch 004: 1043 / 1689 loss=4.689, nll_loss=3.117, ppl=8.67, wps=386949, ups=0.89, wpb=433826, bsz=16702.7, num_updates=6100, lr=0.000809776, gnorm=0.37, clip=0, loss_scale=4, train_wall=92, gb_free=19.7, wall=5937 epoch 004: 1144 / 1689 loss=4.677, nll_loss=3.103, ppl=8.59, wps=452896, ups=1.04, wpb=434415, bsz=16582.6, num_updates=6200, lr=0.000803219, gnorm=0.368, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=6033 epoch 004: 1144 / 1689 loss=4.677, nll_loss=3.103, ppl=8.59, wps=452896, ups=1.04, wpb=434415, bsz=16582.6, num_updates=6200, lr=0.000803219, gnorm=0.368, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=6033 epoch 004: 1144 / 1689 loss=4.677, nll_loss=3.103, ppl=8.59, wps=452896, ups=1.04, wpb=434415, bsz=16582.6, num_updates=6200, lr=0.000803219, gnorm=0.368, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=6033 epoch 004: 1144 / 1689 loss=4.677, nll_loss=3.103, ppl=8.59, wps=452896, ups=1.04, wpb=434415, bsz=16582.6, num_updates=6200, lr=0.000803219, gnorm=0.368, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=6033 epoch 004: 1244 / 1689 loss=4.673, nll_loss=3.099, ppl=8.57, wps=461673, ups=1.06, wpb=434843, bsz=16323.7, num_updates=6300, lr=0.000796819, gnorm=0.369, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=6127 epoch 004: 1244 / 1689 loss=4.673, nll_loss=3.099, ppl=8.57, wps=461673, ups=1.06, wpb=434843, bsz=16323.7, num_updates=6300, lr=0.000796819, gnorm=0.369, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=6127 epoch 004: 1244 / 1689 loss=4.673, nll_loss=3.099, ppl=8.57, wps=461673, ups=1.06, wpb=434843, bsz=16323.7, num_updates=6300, lr=0.000796819, gnorm=0.369, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=6127 epoch 004: 1244 / 1689 loss=4.673, nll_loss=3.099, ppl=8.57, wps=461673, ups=1.06, wpb=434843, bsz=16323.7, num_updates=6300, lr=0.000796819, gnorm=0.369, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=6127 epoch 004: 1344 / 1689 loss=4.665, nll_loss=3.091, ppl=8.52, wps=456292, ups=1.05, wpb=432538, bsz=16691.5, num_updates=6400, lr=0.000790569, gnorm=0.375, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=6221 epoch 004: 1344 / 1689 loss=4.665, nll_loss=3.091, ppl=8.52, wps=456292, ups=1.05, wpb=432538, bsz=16691.5, num_updates=6400, lr=0.000790569, gnorm=0.375, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=6221 epoch 004: 1344 / 1689 loss=4.665, nll_loss=3.091, ppl=8.52, wps=456292, ups=1.05, wpb=432538, bsz=16691.5, num_updates=6400, lr=0.000790569, gnorm=0.375, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=6221 epoch 004: 1344 / 1689 loss=4.665, nll_loss=3.091, ppl=8.52, wps=456292, ups=1.05, wpb=432538, bsz=16691.5, num_updates=6400, lr=0.000790569, gnorm=0.375, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=6221 epoch 004: 1444 / 1689 loss=4.658, nll_loss=3.083, ppl=8.47, wps=458730, ups=1.06, wpb=434479, bsz=16687.9, num_updates=6500, lr=0.000784465, gnorm=0.377, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=6316 epoch 004: 1444 / 1689 loss=4.658, nll_loss=3.083, ppl=8.47, wps=458730, ups=1.06, wpb=434479, bsz=16687.9, num_updates=6500, lr=0.000784465, gnorm=0.377, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=6316 epoch 004: 1444 / 1689 loss=4.658, nll_loss=3.083, ppl=8.47, wps=458730, ups=1.06, wpb=434479, bsz=16687.9, num_updates=6500, lr=0.000784465, gnorm=0.377, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=6316 epoch 004: 1444 / 1689 loss=4.658, nll_loss=3.083, ppl=8.47, wps=458730, ups=1.06, wpb=434479, bsz=16687.9, num_updates=6500, lr=0.000784465, gnorm=0.377, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=6316 epoch 004: 1544 / 1689 loss=4.657, nll_loss=3.081, ppl=8.46, wps=461484, ups=1.06, wpb=434053, bsz=16285.8, num_updates=6600, lr=0.000778499, gnorm=0.362, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=6410 epoch 004: 1544 / 1689 loss=4.657, nll_loss=3.081, ppl=8.46, wps=461484, ups=1.06, wpb=434053, bsz=16285.8, num_updates=6600, lr=0.000778499, gnorm=0.362, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=6410 epoch 004: 1544 / 1689 loss=4.657, nll_loss=3.081, ppl=8.46, wps=461484, ups=1.06, wpb=434053, bsz=16285.8, num_updates=6600, lr=0.000778499, gnorm=0.362, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=6410 epoch 004: 1544 / 1689 loss=4.657, nll_loss=3.081, ppl=8.46, wps=461484, ups=1.06, wpb=434053, bsz=16285.8, num_updates=6600, lr=0.000778499, gnorm=0.362, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=6410 epoch 004: 1644 / 1689 loss=4.655, nll_loss=3.08, ppl=8.46, wps=459766, ups=1.06, wpb=432968, bsz=16215.5, num_updates=6700, lr=0.000772667, gnorm=0.37, clip=0, loss_scale=4, train_wall=92, gb_free=19.5, wall=6504 epoch 004: 1644 / 1689 loss=4.655, nll_loss=3.08, ppl=8.46, wps=459766, ups=1.06, wpb=432968, bsz=16215.5, num_updates=6700, lr=0.000772667, gnorm=0.37, clip=0, loss_scale=4, train_wall=92, gb_free=19.5, wall=6504 epoch 004: 1644 / 1689 loss=4.655, nll_loss=3.08, ppl=8.46, wps=459766, ups=1.06, wpb=432968, bsz=16215.5, num_updates=6700, lr=0.000772667, gnorm=0.37, clip=0, loss_scale=4, train_wall=92, gb_free=19.5, wall=6504 epoch 004: 1644 / 1689 loss=4.655, nll_loss=3.08, ppl=8.46, wps=459766, ups=1.06, wpb=432968, bsz=16215.5, num_updates=6700, lr=0.000772667, gnorm=0.37, clip=0, loss_scale=4, train_wall=92, gb_free=19.5, wall=6504 end of epoch 4 (average epoch stats below) epoch 004 | loss 4.697 | nll_loss 3.125 | ppl 8.72 | wps 454674 | ups 1.05 | wpb 433521 | bsz 16503 | num_updates 6745 | lr 0.000770086 | gnorm 0.374 | clip 0 | loss_scale 4 | train_wall 1567 | gb_free 19.4 | wall 6546 epoch 004 | loss 4.697 | nll_loss 3.125 | ppl 8.72 | wps 454674 | ups 1.05 | wpb 433521 | bsz 16503 | num_updates 6745 | lr 0.000770086 | gnorm 0.374 | clip 0 | loss_scale 4 | train_wall 1567 | gb_free 19.4 | wall 6546 epoch 004 | loss 4.697 | nll_loss 3.125 | ppl 8.72 | wps 454674 | ups 1.05 | wpb 433521 | bsz 16503 | num_updates 6745 | lr 0.000770086 | gnorm 0.374 | clip 0 | loss_scale 4 | train_wall 1567 | gb_free 19.4 | wall 6546 epoch 004 | loss 4.697 | nll_loss 3.125 | ppl 8.72 | wps 454674 | ups 1.05 | wpb 433521 | bsz 16503 | num_updates 6745 | lr 0.000770086 | gnorm 0.374 | clip 0 | loss_scale 4 | train_wall 1567 | gb_free 19.4 | wall 6546 Start iterating over samples epoch 005: 55 / 1689 loss=4.624, nll_loss=3.044, ppl=8.25, wps=456085, ups=1.06, wpb=429079, bsz=16276.2, num_updates=6800, lr=0.000766965, gnorm=0.358, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=6599 epoch 005: 55 / 1689 loss=4.624, nll_loss=3.044, ppl=8.25, wps=456085, ups=1.06, wpb=429079, bsz=16276.2, num_updates=6800, lr=0.000766965, gnorm=0.358, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=6599 epoch 005: 55 / 1689 loss=4.624, nll_loss=3.044, ppl=8.25, wps=456085, ups=1.06, wpb=429079, bsz=16276.2, num_updates=6800, lr=0.000766965, gnorm=0.358, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=6599 epoch 005: 55 / 1689 loss=4.624, nll_loss=3.044, ppl=8.25, wps=456085, ups=1.06, wpb=429079, bsz=16276.2, num_updates=6800, lr=0.000766965, gnorm=0.358, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=6599 epoch 005: 55 / 1689 loss=4.624, nll_loss=3.044, ppl=8.25, wps=456085, ups=1.06, wpb=429079, bsz=16276.2, num_updates=6800, lr=0.000766965, gnorm=0.358, clip=0, loss_scale=4, train_wall=92, gb_free=21, wall=6599 epoch 005: 155 / 1689 loss=4.619, nll_loss=3.039, ppl=8.22, wps=460336, ups=1.06, wpb=434345, bsz=16386.6, num_updates=6900, lr=0.000761387, gnorm=0.368, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=6693 epoch 005: 155 / 1689 loss=4.619, nll_loss=3.039, ppl=8.22, wps=460336, ups=1.06, wpb=434345, bsz=16386.6, num_updates=6900, lr=0.000761387, gnorm=0.368, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=6693 epoch 005: 155 / 1689 loss=4.619, nll_loss=3.039, ppl=8.22, wps=460336, ups=1.06, wpb=434345, bsz=16386.6, num_updates=6900, lr=0.000761387, gnorm=0.368, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=6693 epoch 005: 155 / 1689 loss=4.619, nll_loss=3.039, ppl=8.22, wps=460336, ups=1.06, wpb=434345, bsz=16386.6, num_updates=6900, lr=0.000761387, gnorm=0.368, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=6693 epoch 005: 155 / 1689 loss=4.619, nll_loss=3.039, ppl=8.22, wps=460336, ups=1.06, wpb=434345, bsz=16386.6, num_updates=6900, lr=0.000761387, gnorm=0.368, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=6693 epoch 005: 255 / 1689 loss=4.615, nll_loss=3.034, ppl=8.19, wps=458558, ups=1.06, wpb=432647, bsz=16303, num_updates=7000, lr=0.000755929, gnorm=0.373, clip=0, loss_scale=4, train_wall=92, gb_free=20.7, wall=6787 epoch 005: 255 / 1689 loss=4.615, nll_loss=3.034, ppl=8.19, wps=458558, ups=1.06, wpb=432647, bsz=16303, num_updates=7000, lr=0.000755929, gnorm=0.373, clip=0, loss_scale=4, train_wall=92, gb_free=20.7, wall=6787 epoch 005: 255 / 1689 loss=4.615, nll_loss=3.034, ppl=8.19, wps=458558, ups=1.06, wpb=432647, bsz=16303, num_updates=7000, lr=0.000755929, gnorm=0.373, clip=0, loss_scale=4, train_wall=92, gb_free=20.7, wall=6787 epoch 005: 255 / 1689 loss=4.615, nll_loss=3.034, ppl=8.19, wps=458558, ups=1.06, wpb=432647, bsz=16303, num_updates=7000, lr=0.000755929, gnorm=0.373, clip=0, loss_scale=4, train_wall=92, gb_free=20.7, wall=6787 epoch 005: 255 / 1689 loss=4.615, nll_loss=3.034, ppl=8.19, wps=458558, ups=1.06, wpb=432647, bsz=16303, num_updates=7000, lr=0.000755929, gnorm=0.373, clip=0, loss_scale=4, train_wall=92, gb_free=20.7, wall=6787 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 4.552 | nll_loss 2.894 | ppl 7.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.552 epoch 005 | valid on 'valid' subset | loss 4.552 | nll_loss 2.894 | ppl 7.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.552 epoch 005 | valid on 'valid' subset | loss 4.552 | nll_loss 2.894 | ppl 7.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.552 epoch 005 | valid on 'valid' subset | loss 4.552 | nll_loss 2.894 | ppl 7.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.552 epoch 005 | valid on 'valid' subset | loss 4.552 | nll_loss 2.894 | ppl 7.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.552 epoch 005: 355 / 1689 loss=4.603, nll_loss=3.022, ppl=8.12, wps=377894, ups=0.87, wpb=434696, bsz=16697.7, num_updates=7100, lr=0.000750587, gnorm=0.364, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=6902 epoch 005: 355 / 1689 loss=4.603, nll_loss=3.022, ppl=8.12, wps=377894, ups=0.87, wpb=434696, bsz=16697.7, num_updates=7100, lr=0.000750587, gnorm=0.364, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=6902 epoch 005: 355 / 1689 loss=4.603, nll_loss=3.022, ppl=8.12, wps=377894, ups=0.87, wpb=434696, bsz=16697.7, num_updates=7100, lr=0.000750587, gnorm=0.364, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=6902 epoch 005: 355 / 1689 loss=4.603, nll_loss=3.022, ppl=8.12, wps=377894, ups=0.87, wpb=434696, bsz=16697.7, num_updates=7100, lr=0.000750587, gnorm=0.364, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=6902 epoch 005: 355 / 1689 loss=4.603, nll_loss=3.022, ppl=8.12, wps=377894, ups=0.87, wpb=434696, bsz=16697.7, num_updates=7100, lr=0.000750587, gnorm=0.364, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=6902 epoch 005: 455 / 1689 loss=4.612, nll_loss=3.032, ppl=8.18, wps=457718, ups=1.06, wpb=432710, bsz=16605.7, num_updates=7200, lr=0.000745356, gnorm=0.351, clip=0, loss_scale=8, train_wall=93, gb_free=18.6, wall=6997 epoch 005: 455 / 1689 loss=4.612, nll_loss=3.032, ppl=8.18, wps=457718, ups=1.06, wpb=432710, bsz=16605.7, num_updates=7200, lr=0.000745356, gnorm=0.351, clip=0, loss_scale=8, train_wall=93, gb_free=18.6, wall=6997 epoch 005: 455 / 1689 loss=4.612, nll_loss=3.032, ppl=8.18, wps=457718, ups=1.06, wpb=432710, bsz=16605.7, num_updates=7200, lr=0.000745356, gnorm=0.351, clip=0, loss_scale=8, train_wall=93, gb_free=18.6, wall=6997 epoch 005: 455 / 1689 loss=4.612, nll_loss=3.032, ppl=8.18, wps=457718, ups=1.06, wpb=432710, bsz=16605.7, num_updates=7200, lr=0.000745356, gnorm=0.351, clip=0, loss_scale=8, train_wall=93, gb_free=18.6, wall=6997 epoch 005: 455 / 1689 loss=4.612, nll_loss=3.032, ppl=8.18, wps=457718, ups=1.06, wpb=432710, bsz=16605.7, num_updates=7200, lr=0.000745356, gnorm=0.351, clip=0, loss_scale=8, train_wall=93, gb_free=18.6, wall=6997 epoch 005: 556 / 1689 loss=4.597, nll_loss=3.016, ppl=8.09, wps=453038, ups=1.04, wpb=434097, bsz=16616, num_updates=7300, lr=0.000740233, gnorm=0.351, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=7093 epoch 005: 556 / 1689 loss=4.597, nll_loss=3.016, ppl=8.09, wps=453038, ups=1.04, wpb=434097, bsz=16616, num_updates=7300, lr=0.000740233, gnorm=0.351, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=7093 epoch 005: 556 / 1689 loss=4.597, nll_loss=3.016, ppl=8.09, wps=453038, ups=1.04, wpb=434097, bsz=16616, num_updates=7300, lr=0.000740233, gnorm=0.351, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=7093 epoch 005: 556 / 1689 loss=4.597, nll_loss=3.016, ppl=8.09, wps=453038, ups=1.04, wpb=434097, bsz=16616, num_updates=7300, lr=0.000740233, gnorm=0.351, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=7093 epoch 005: 556 / 1689 loss=4.597, nll_loss=3.016, ppl=8.09, wps=453038, ups=1.04, wpb=434097, bsz=16616, num_updates=7300, lr=0.000740233, gnorm=0.351, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=7093 epoch 005: 656 / 1689 loss=4.606, nll_loss=3.026, ppl=8.14, wps=459078, ups=1.06, wpb=433328, bsz=16463.8, num_updates=7400, lr=0.000735215, gnorm=0.361, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=7187 epoch 005: 656 / 1689 loss=4.606, nll_loss=3.026, ppl=8.14, wps=459078, ups=1.06, wpb=433328, bsz=16463.8, num_updates=7400, lr=0.000735215, gnorm=0.361, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=7187 epoch 005: 656 / 1689 loss=4.606, nll_loss=3.026, ppl=8.14, wps=459078, ups=1.06, wpb=433328, bsz=16463.8, num_updates=7400, lr=0.000735215, gnorm=0.361, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=7187 epoch 005: 656 / 1689 loss=4.606, nll_loss=3.026, ppl=8.14, wps=459078, ups=1.06, wpb=433328, bsz=16463.8, num_updates=7400, lr=0.000735215, gnorm=0.361, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=7187 epoch 005: 656 / 1689 loss=4.606, nll_loss=3.026, ppl=8.14, wps=459078, ups=1.06, wpb=433328, bsz=16463.8, num_updates=7400, lr=0.000735215, gnorm=0.361, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=7187 epoch 005: 756 / 1689 loss=4.597, nll_loss=3.016, ppl=8.09, wps=457278, ups=1.06, wpb=431285, bsz=16406.2, num_updates=7500, lr=0.000730297, gnorm=0.345, clip=0, loss_scale=4, train_wall=92, gb_free=19.6, wall=7281 epoch 005: 756 / 1689 loss=4.597, nll_loss=3.016, ppl=8.09, wps=457278, ups=1.06, wpb=431285, bsz=16406.2, num_updates=7500, lr=0.000730297, gnorm=0.345, clip=0, loss_scale=4, train_wall=92, gb_free=19.6, wall=7281 epoch 005: 756 / 1689 loss=4.597, nll_loss=3.016, ppl=8.09, wps=457278, ups=1.06, wpb=431285, bsz=16406.2, num_updates=7500, lr=0.000730297, gnorm=0.345, clip=0, loss_scale=4, train_wall=92, gb_free=19.6, wall=7281 epoch 005: 756 / 1689 loss=4.597, nll_loss=3.016, ppl=8.09, wps=457278, ups=1.06, wpb=431285, bsz=16406.2, num_updates=7500, lr=0.000730297, gnorm=0.345, clip=0, loss_scale=4, train_wall=92, gb_free=19.6, wall=7281 epoch 005: 756 / 1689 loss=4.597, nll_loss=3.016, ppl=8.09, wps=457278, ups=1.06, wpb=431285, bsz=16406.2, num_updates=7500, lr=0.000730297, gnorm=0.345, clip=0, loss_scale=4, train_wall=92, gb_free=19.6, wall=7281 epoch 005: 857 / 1689 loss=4.591, nll_loss=3.01, ppl=8.05, wps=452531, ups=1.04, wpb=435224, bsz=16500.9, num_updates=7600, lr=0.000725476, gnorm=0.364, clip=0, loss_scale=2, train_wall=95, gb_free=17.5, wall=7378 epoch 005: 857 / 1689 loss=4.591, nll_loss=3.01, ppl=8.05, wps=452531, ups=1.04, wpb=435224, bsz=16500.9, num_updates=7600, lr=0.000725476, gnorm=0.364, clip=0, loss_scale=2, train_wall=95, gb_free=17.5, wall=7378 epoch 005: 857 / 1689 loss=4.591, nll_loss=3.01, ppl=8.05, wps=452531, ups=1.04, wpb=435224, bsz=16500.9, num_updates=7600, lr=0.000725476, gnorm=0.364, clip=0, loss_scale=2, train_wall=95, gb_free=17.5, wall=7378 epoch 005: 857 / 1689 loss=4.591, nll_loss=3.01, ppl=8.05, wps=452531, ups=1.04, wpb=435224, bsz=16500.9, num_updates=7600, lr=0.000725476, gnorm=0.364, clip=0, loss_scale=2, train_wall=95, gb_free=17.5, wall=7378 epoch 005: 857 / 1689 loss=4.591, nll_loss=3.01, ppl=8.05, wps=452531, ups=1.04, wpb=435224, bsz=16500.9, num_updates=7600, lr=0.000725476, gnorm=0.364, clip=0, loss_scale=2, train_wall=95, gb_free=17.5, wall=7378 epoch 005: 957 / 1689 loss=4.595, nll_loss=3.015, ppl=8.08, wps=460729, ups=1.06, wpb=435414, bsz=16477.5, num_updates=7700, lr=0.00072075, gnorm=0.362, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=7472 epoch 005: 957 / 1689 loss=4.595, nll_loss=3.015, ppl=8.08, wps=460729, ups=1.06, wpb=435414, bsz=16477.5, num_updates=7700, lr=0.00072075, gnorm=0.362, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=7472 epoch 005: 957 / 1689 loss=4.595, nll_loss=3.015, ppl=8.08, wps=460729, ups=1.06, wpb=435414, bsz=16477.5, num_updates=7700, lr=0.00072075, gnorm=0.362, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=7472 epoch 005: 957 / 1689 loss=4.595, nll_loss=3.015, ppl=8.08, wps=460729, ups=1.06, wpb=435414, bsz=16477.5, num_updates=7700, lr=0.00072075, gnorm=0.362, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=7472 epoch 005: 957 / 1689 loss=4.595, nll_loss=3.015, ppl=8.08, wps=460729, ups=1.06, wpb=435414, bsz=16477.5, num_updates=7700, lr=0.00072075, gnorm=0.362, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=7472 epoch 005: 1057 / 1689 loss=4.575, nll_loss=2.993, ppl=7.96, wps=461065, ups=1.06, wpb=434233, bsz=16651.7, num_updates=7800, lr=0.000716115, gnorm=0.351, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=7566 epoch 005: 1057 / 1689 loss=4.575, nll_loss=2.993, ppl=7.96, wps=461065, ups=1.06, wpb=434233, bsz=16651.7, num_updates=7800, lr=0.000716115, gnorm=0.351, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=7566 epoch 005: 1057 / 1689 loss=4.575, nll_loss=2.993, ppl=7.96, wps=461065, ups=1.06, wpb=434233, bsz=16651.7, num_updates=7800, lr=0.000716115, gnorm=0.351, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=7566 epoch 005: 1057 / 1689 loss=4.575, nll_loss=2.993, ppl=7.96, wps=461065, ups=1.06, wpb=434233, bsz=16651.7, num_updates=7800, lr=0.000716115, gnorm=0.351, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=7566 epoch 005: 1057 / 1689 loss=4.575, nll_loss=2.993, ppl=7.96, wps=461065, ups=1.06, wpb=434233, bsz=16651.7, num_updates=7800, lr=0.000716115, gnorm=0.351, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=7566 epoch 005: 1157 / 1689 loss=4.577, nll_loss=2.995, ppl=7.97, wps=459599, ups=1.06, wpb=435090, bsz=16662.6, num_updates=7900, lr=0.000711568, gnorm=0.341, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=7661 epoch 005: 1157 / 1689 loss=4.577, nll_loss=2.995, ppl=7.97, wps=459599, ups=1.06, wpb=435090, bsz=16662.6, num_updates=7900, lr=0.000711568, gnorm=0.341, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=7661 epoch 005: 1157 / 1689 loss=4.577, nll_loss=2.995, ppl=7.97, wps=459599, ups=1.06, wpb=435090, bsz=16662.6, num_updates=7900, lr=0.000711568, gnorm=0.341, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=7661 epoch 005: 1157 / 1689 loss=4.577, nll_loss=2.995, ppl=7.97, wps=459599, ups=1.06, wpb=435090, bsz=16662.6, num_updates=7900, lr=0.000711568, gnorm=0.341, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=7661 epoch 005: 1157 / 1689 loss=4.577, nll_loss=2.995, ppl=7.97, wps=459599, ups=1.06, wpb=435090, bsz=16662.6, num_updates=7900, lr=0.000711568, gnorm=0.341, clip=0, loss_scale=2, train_wall=93, gb_free=17.6, wall=7661 epoch 005: 1257 / 1689 loss=4.58, nll_loss=2.998, ppl=7.99, wps=459238, ups=1.06, wpb=434275, bsz=16374.6, num_updates=8000, lr=0.000707107, gnorm=0.35, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=7755 epoch 005: 1257 / 1689 loss=4.58, nll_loss=2.998, ppl=7.99, wps=459238, ups=1.06, wpb=434275, bsz=16374.6, num_updates=8000, lr=0.000707107, gnorm=0.35, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=7755 epoch 005: 1257 / 1689 loss=4.58, nll_loss=2.998, ppl=7.99, wps=459238, ups=1.06, wpb=434275, bsz=16374.6, num_updates=8000, lr=0.000707107, gnorm=0.35, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=7755 epoch 005: 1257 / 1689 loss=4.58, nll_loss=2.998, ppl=7.99, wps=459238, ups=1.06, wpb=434275, bsz=16374.6, num_updates=8000, lr=0.000707107, gnorm=0.35, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=7755 epoch 005: 1257 / 1689 loss=4.58, nll_loss=2.998, ppl=7.99, wps=459238, ups=1.06, wpb=434275, bsz=16374.6, num_updates=8000, lr=0.000707107, gnorm=0.35, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=7755 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 4.469 | nll_loss 2.818 | ppl 7.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.469 epoch 005 | valid on 'valid' subset | loss 4.469 | nll_loss 2.818 | ppl 7.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.469 epoch 005 | valid on 'valid' subset | loss 4.469 | nll_loss 2.818 | ppl 7.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.469 epoch 005 | valid on 'valid' subset | loss 4.469 | nll_loss 2.818 | ppl 7.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.469 epoch 005 | valid on 'valid' subset | loss 4.469 | nll_loss 2.818 | ppl 7.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.469 epoch 005: 1357 / 1689 loss=4.576, nll_loss=2.994, ppl=7.97, wps=380923, ups=0.88, wpb=434803, bsz=16743.4, num_updates=8100, lr=0.000702728, gnorm=0.337, clip=0, loss_scale=4, train_wall=94, gb_free=17.5, wall=7870 epoch 005: 1357 / 1689 loss=4.576, nll_loss=2.994, ppl=7.97, wps=380923, ups=0.88, wpb=434803, bsz=16743.4, num_updates=8100, lr=0.000702728, gnorm=0.337, clip=0, loss_scale=4, train_wall=94, gb_free=17.5, wall=7870 epoch 005: 1357 / 1689 loss=4.576, nll_loss=2.994, ppl=7.97, wps=380923, ups=0.88, wpb=434803, bsz=16743.4, num_updates=8100, lr=0.000702728, gnorm=0.337, clip=0, loss_scale=4, train_wall=94, gb_free=17.5, wall=7870 epoch 005: 1357 / 1689 loss=4.576, nll_loss=2.994, ppl=7.97, wps=380923, ups=0.88, wpb=434803, bsz=16743.4, num_updates=8100, lr=0.000702728, gnorm=0.337, clip=0, loss_scale=4, train_wall=94, gb_free=17.5, wall=7870 epoch 005: 1357 / 1689 loss=4.576, nll_loss=2.994, ppl=7.97, wps=380923, ups=0.88, wpb=434803, bsz=16743.4, num_updates=8100, lr=0.000702728, gnorm=0.337, clip=0, loss_scale=4, train_wall=94, gb_free=17.5, wall=7870 epoch 005: 1457 / 1689 loss=4.569, nll_loss=2.987, ppl=7.93, wps=462879, ups=1.07, wpb=433309, bsz=16459.7, num_updates=8200, lr=0.00069843, gnorm=0.346, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=7963 epoch 005: 1457 / 1689 loss=4.569, nll_loss=2.987, ppl=7.93, wps=462879, ups=1.07, wpb=433309, bsz=16459.7, num_updates=8200, lr=0.00069843, gnorm=0.346, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=7963 epoch 005: 1457 / 1689 loss=4.569, nll_loss=2.987, ppl=7.93, wps=462879, ups=1.07, wpb=433309, bsz=16459.7, num_updates=8200, lr=0.00069843, gnorm=0.346, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=7963 epoch 005: 1457 / 1689 loss=4.569, nll_loss=2.987, ppl=7.93, wps=462879, ups=1.07, wpb=433309, bsz=16459.7, num_updates=8200, lr=0.00069843, gnorm=0.346, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=7963 epoch 005: 1457 / 1689 loss=4.569, nll_loss=2.987, ppl=7.93, wps=462879, ups=1.07, wpb=433309, bsz=16459.7, num_updates=8200, lr=0.00069843, gnorm=0.346, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=7963 epoch 005: 1557 / 1689 loss=4.563, nll_loss=2.98, ppl=7.89, wps=461809, ups=1.07, wpb=431965, bsz=16188.5, num_updates=8300, lr=0.00069421, gnorm=0.357, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=8057 epoch 005: 1557 / 1689 loss=4.563, nll_loss=2.98, ppl=7.89, wps=461809, ups=1.07, wpb=431965, bsz=16188.5, num_updates=8300, lr=0.00069421, gnorm=0.357, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=8057 epoch 005: 1557 / 1689 loss=4.563, nll_loss=2.98, ppl=7.89, wps=461809, ups=1.07, wpb=431965, bsz=16188.5, num_updates=8300, lr=0.00069421, gnorm=0.357, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=8057 epoch 005: 1557 / 1689 loss=4.563, nll_loss=2.98, ppl=7.89, wps=461809, ups=1.07, wpb=431965, bsz=16188.5, num_updates=8300, lr=0.00069421, gnorm=0.357, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=8057 epoch 005: 1557 / 1689 loss=4.563, nll_loss=2.98, ppl=7.89, wps=461809, ups=1.07, wpb=431965, bsz=16188.5, num_updates=8300, lr=0.00069421, gnorm=0.357, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=8057 epoch 005: 1657 / 1689 loss=4.569, nll_loss=2.988, ppl=7.93, wps=459827, ups=1.06, wpb=432865, bsz=16852.4, num_updates=8400, lr=0.000690066, gnorm=0.348, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=8151 epoch 005: 1657 / 1689 loss=4.569, nll_loss=2.988, ppl=7.93, wps=459827, ups=1.06, wpb=432865, bsz=16852.4, num_updates=8400, lr=0.000690066, gnorm=0.348, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=8151 epoch 005: 1657 / 1689 loss=4.569, nll_loss=2.988, ppl=7.93, wps=459827, ups=1.06, wpb=432865, bsz=16852.4, num_updates=8400, lr=0.000690066, gnorm=0.348, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=8151 epoch 005: 1657 / 1689 loss=4.569, nll_loss=2.988, ppl=7.93, wps=459827, ups=1.06, wpb=432865, bsz=16852.4, num_updates=8400, lr=0.000690066, gnorm=0.348, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=8151 epoch 005: 1657 / 1689 loss=4.569, nll_loss=2.988, ppl=7.93, wps=459827, ups=1.06, wpb=432865, bsz=16852.4, num_updates=8400, lr=0.000690066, gnorm=0.348, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=8151 end of epoch 5 (average epoch stats below) epoch 005 | loss 4.591 | nll_loss 3.009 | ppl 8.05 | wps 447513 | ups 1.03 | wpb 433530 | bsz 16505.8 | num_updates 8432 | lr 0.000688755 | gnorm 0.354 | clip 0 | loss_scale 4 | train_wall 1568 | gb_free 23.8 | wall 8180 epoch 005 | loss 4.591 | nll_loss 3.009 | ppl 8.05 | wps 447513 | ups 1.03 | wpb 433530 | bsz 16505.8 | num_updates 8432 | lr 0.000688755 | gnorm 0.354 | clip 0 | loss_scale 4 | train_wall 1568 | gb_free 23.8 | wall 8180 epoch 005 | loss 4.591 | nll_loss 3.009 | ppl 8.05 | wps 447513 | ups 1.03 | wpb 433530 | bsz 16505.8 | num_updates 8432 | lr 0.000688755 | gnorm 0.354 | clip 0 | loss_scale 4 | train_wall 1568 | gb_free 23.8 | wall 8180 epoch 005 | loss 4.591 | nll_loss 3.009 | ppl 8.05 | wps 447513 | ups 1.03 | wpb 433530 | bsz 16505.8 | num_updates 8432 | lr 0.000688755 | gnorm 0.354 | clip 0 | loss_scale 4 | train_wall 1568 | gb_free 23.8 | wall 8180 epoch 005 | loss 4.591 | nll_loss 3.009 | ppl 8.05 | wps 447513 | ups 1.03 | wpb 433530 | bsz 16505.8 | num_updates 8432 | lr 0.000688755 | gnorm 0.354 | clip 0 | loss_scale 4 | train_wall 1568 | gb_free 23.8 | wall 8180 Start iterating over samples epoch 006: 68 / 1689 loss=4.534, nll_loss=2.946, ppl=7.71, wps=447916, ups=1.04, wpb=430246, bsz=16081.4, num_updates=8500, lr=0.000685994, gnorm=0.347, clip=0, loss_scale=4, train_wall=94, gb_free=19.3, wall=8247 epoch 006: 68 / 1689 loss=4.534, nll_loss=2.946, ppl=7.71, wps=447916, ups=1.04, wpb=430246, bsz=16081.4, num_updates=8500, lr=0.000685994, gnorm=0.347, clip=0, loss_scale=4, train_wall=94, gb_free=19.3, wall=8247 epoch 006: 68 / 1689 loss=4.534, nll_loss=2.946, ppl=7.71, wps=447916, ups=1.04, wpb=430246, bsz=16081.4, num_updates=8500, lr=0.000685994, gnorm=0.347, clip=0, loss_scale=4, train_wall=94, gb_free=19.3, wall=8247 epoch 006: 68 / 1689 loss=4.534, nll_loss=2.946, ppl=7.71, wps=447916, ups=1.04, wpb=430246, bsz=16081.4, num_updates=8500, lr=0.000685994, gnorm=0.347, clip=0, loss_scale=4, train_wall=94, gb_free=19.3, wall=8247 epoch 006: 68 / 1689 loss=4.534, nll_loss=2.946, ppl=7.71, wps=447916, ups=1.04, wpb=430246, bsz=16081.4, num_updates=8500, lr=0.000685994, gnorm=0.347, clip=0, loss_scale=4, train_wall=94, gb_free=19.3, wall=8247 epoch 006: 68 / 1689 loss=4.534, nll_loss=2.946, ppl=7.71, wps=447916, ups=1.04, wpb=430246, bsz=16081.4, num_updates=8500, lr=0.000685994, gnorm=0.347, clip=0, loss_scale=4, train_wall=94, gb_free=19.3, wall=8247 epoch 006: 169 / 1689 loss=4.54, nll_loss=2.954, ppl=7.75, wps=457568, ups=1.05, wpb=434829, bsz=16290.2, num_updates=8600, lr=0.000681994, gnorm=0.345, clip=0, loss_scale=4, train_wall=94, gb_free=19.9, wall=8342 epoch 006: 169 / 1689 loss=4.54, nll_loss=2.954, ppl=7.75, wps=457568, ups=1.05, wpb=434829, bsz=16290.2, num_updates=8600, lr=0.000681994, gnorm=0.345, clip=0, loss_scale=4, train_wall=94, gb_free=19.9, wall=8342 epoch 006: 169 / 1689 loss=4.54, nll_loss=2.954, ppl=7.75, wps=457568, ups=1.05, wpb=434829, bsz=16290.2, num_updates=8600, lr=0.000681994, gnorm=0.345, clip=0, loss_scale=4, train_wall=94, gb_free=19.9, wall=8342 epoch 006: 169 / 1689 loss=4.54, nll_loss=2.954, ppl=7.75, wps=457568, ups=1.05, wpb=434829, bsz=16290.2, num_updates=8600, lr=0.000681994, gnorm=0.345, clip=0, loss_scale=4, train_wall=94, gb_free=19.9, wall=8342 epoch 006: 169 / 1689 loss=4.54, nll_loss=2.954, ppl=7.75, wps=457568, ups=1.05, wpb=434829, bsz=16290.2, num_updates=8600, lr=0.000681994, gnorm=0.345, clip=0, loss_scale=4, train_wall=94, gb_free=19.9, wall=8342 epoch 006: 169 / 1689 loss=4.54, nll_loss=2.954, ppl=7.75, wps=457568, ups=1.05, wpb=434829, bsz=16290.2, num_updates=8600, lr=0.000681994, gnorm=0.345, clip=0, loss_scale=4, train_wall=94, gb_free=19.9, wall=8342 epoch 006: 269 / 1689 loss=4.528, nll_loss=2.94, ppl=7.67, wps=458495, ups=1.06, wpb=433005, bsz=16685.1, num_updates=8700, lr=0.000678064, gnorm=0.338, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=8436 epoch 006: 269 / 1689 loss=4.528, nll_loss=2.94, ppl=7.67, wps=458495, ups=1.06, wpb=433005, bsz=16685.1, num_updates=8700, lr=0.000678064, gnorm=0.338, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=8436 epoch 006: 269 / 1689 loss=4.528, nll_loss=2.94, ppl=7.67, wps=458495, ups=1.06, wpb=433005, bsz=16685.1, num_updates=8700, lr=0.000678064, gnorm=0.338, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=8436 epoch 006: 269 / 1689 loss=4.528, nll_loss=2.94, ppl=7.67, wps=458495, ups=1.06, wpb=433005, bsz=16685.1, num_updates=8700, lr=0.000678064, gnorm=0.338, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=8436 epoch 006: 269 / 1689 loss=4.528, nll_loss=2.94, ppl=7.67, wps=458495, ups=1.06, wpb=433005, bsz=16685.1, num_updates=8700, lr=0.000678064, gnorm=0.338, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=8436 epoch 006: 269 / 1689 loss=4.528, nll_loss=2.94, ppl=7.67, wps=458495, ups=1.06, wpb=433005, bsz=16685.1, num_updates=8700, lr=0.000678064, gnorm=0.338, clip=0, loss_scale=4, train_wall=94, gb_free=18.5, wall=8436 epoch 006: 369 / 1689 loss=4.537, nll_loss=2.95, ppl=7.73, wps=458357, ups=1.06, wpb=433430, bsz=16338.6, num_updates=8800, lr=0.0006742, gnorm=0.332, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=8531 epoch 006: 369 / 1689 loss=4.537, nll_loss=2.95, ppl=7.73, wps=458357, ups=1.06, wpb=433430, bsz=16338.6, num_updates=8800, lr=0.0006742, gnorm=0.332, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=8531 epoch 006: 369 / 1689 loss=4.537, nll_loss=2.95, ppl=7.73, wps=458357, ups=1.06, wpb=433430, bsz=16338.6, num_updates=8800, lr=0.0006742, gnorm=0.332, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=8531 epoch 006: 369 / 1689 loss=4.537, nll_loss=2.95, ppl=7.73, wps=458357, ups=1.06, wpb=433430, bsz=16338.6, num_updates=8800, lr=0.0006742, gnorm=0.332, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=8531 epoch 006: 369 / 1689 loss=4.537, nll_loss=2.95, ppl=7.73, wps=458357, ups=1.06, wpb=433430, bsz=16338.6, num_updates=8800, lr=0.0006742, gnorm=0.332, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=8531 epoch 006: 369 / 1689 loss=4.537, nll_loss=2.95, ppl=7.73, wps=458357, ups=1.06, wpb=433430, bsz=16338.6, num_updates=8800, lr=0.0006742, gnorm=0.332, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=8531 epoch 006: 469 / 1689 loss=4.53, nll_loss=2.943, ppl=7.69, wps=456547, ups=1.06, wpb=432280, bsz=16418, num_updates=8900, lr=0.000670402, gnorm=0.349, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=8626 epoch 006: 469 / 1689 loss=4.53, nll_loss=2.943, ppl=7.69, wps=456547, ups=1.06, wpb=432280, bsz=16418, num_updates=8900, lr=0.000670402, gnorm=0.349, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=8626 epoch 006: 469 / 1689 loss=4.53, nll_loss=2.943, ppl=7.69, wps=456547, ups=1.06, wpb=432280, bsz=16418, num_updates=8900, lr=0.000670402, gnorm=0.349, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=8626 epoch 006: 469 / 1689 loss=4.53, nll_loss=2.943, ppl=7.69, wps=456547, ups=1.06, wpb=432280, bsz=16418, num_updates=8900, lr=0.000670402, gnorm=0.349, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=8626 epoch 006: 469 / 1689 loss=4.53, nll_loss=2.943, ppl=7.69, wps=456547, ups=1.06, wpb=432280, bsz=16418, num_updates=8900, lr=0.000670402, gnorm=0.349, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=8626 epoch 006: 469 / 1689 loss=4.53, nll_loss=2.943, ppl=7.69, wps=456547, ups=1.06, wpb=432280, bsz=16418, num_updates=8900, lr=0.000670402, gnorm=0.349, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=8626 epoch 006: 569 / 1689 loss=4.528, nll_loss=2.941, ppl=7.68, wps=459450, ups=1.06, wpb=435043, bsz=16600.3, num_updates=9000, lr=0.000666667, gnorm=0.348, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=8720 epoch 006: 569 / 1689 loss=4.528, nll_loss=2.941, ppl=7.68, wps=459450, ups=1.06, wpb=435043, bsz=16600.3, num_updates=9000, lr=0.000666667, gnorm=0.348, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=8720 epoch 006: 569 / 1689 loss=4.528, nll_loss=2.941, ppl=7.68, wps=459450, ups=1.06, wpb=435043, bsz=16600.3, num_updates=9000, lr=0.000666667, gnorm=0.348, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=8720 epoch 006: 569 / 1689 loss=4.528, nll_loss=2.941, ppl=7.68, wps=459450, ups=1.06, wpb=435043, bsz=16600.3, num_updates=9000, lr=0.000666667, gnorm=0.348, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=8720 epoch 006: 569 / 1689 loss=4.528, nll_loss=2.941, ppl=7.68, wps=459450, ups=1.06, wpb=435043, bsz=16600.3, num_updates=9000, lr=0.000666667, gnorm=0.348, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=8720 epoch 006: 569 / 1689 loss=4.528, nll_loss=2.941, ppl=7.68, wps=459450, ups=1.06, wpb=435043, bsz=16600.3, num_updates=9000, lr=0.000666667, gnorm=0.348, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=8720 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 4.443 | nll_loss 2.796 | ppl 6.94 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.443 epoch 006 | valid on 'valid' subset | loss 4.443 | nll_loss 2.796 | ppl 6.94 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.443 epoch 006 | valid on 'valid' subset | loss 4.443 | nll_loss 2.796 | ppl 6.94 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.443 epoch 006 | valid on 'valid' subset | loss 4.443 | nll_loss 2.796 | ppl 6.94 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.443 epoch 006 | valid on 'valid' subset | loss 4.443 | nll_loss 2.796 | ppl 6.94 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.443 epoch 006 | valid on 'valid' subset | loss 4.443 | nll_loss 2.796 | ppl 6.94 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.443 epoch 006: 670 / 1689 loss=4.527, nll_loss=2.94, ppl=7.68, wps=374241, ups=0.87, wpb=432147, bsz=16858.6, num_updates=9100, lr=0.000662994, gnorm=0.331, clip=0, loss_scale=4, train_wall=95, gb_free=17.9, wall=8836 epoch 006: 670 / 1689 loss=4.527, nll_loss=2.94, ppl=7.68, wps=374241, ups=0.87, wpb=432147, bsz=16858.6, num_updates=9100, lr=0.000662994, gnorm=0.331, clip=0, loss_scale=4, train_wall=95, gb_free=17.9, wall=8836 epoch 006: 670 / 1689 loss=4.527, nll_loss=2.94, ppl=7.68, wps=374241, ups=0.87, wpb=432147, bsz=16858.6, num_updates=9100, lr=0.000662994, gnorm=0.331, clip=0, loss_scale=4, train_wall=95, gb_free=17.9, wall=8836 epoch 006: 670 / 1689 loss=4.527, nll_loss=2.94, ppl=7.68, wps=374241, ups=0.87, wpb=432147, bsz=16858.6, num_updates=9100, lr=0.000662994, gnorm=0.331, clip=0, loss_scale=4, train_wall=95, gb_free=17.9, wall=8836 epoch 006: 670 / 1689 loss=4.527, nll_loss=2.94, ppl=7.68, wps=374241, ups=0.87, wpb=432147, bsz=16858.6, num_updates=9100, lr=0.000662994, gnorm=0.331, clip=0, loss_scale=4, train_wall=95, gb_free=17.9, wall=8836 epoch 006: 670 / 1689 loss=4.527, nll_loss=2.94, ppl=7.68, wps=374241, ups=0.87, wpb=432147, bsz=16858.6, num_updates=9100, lr=0.000662994, gnorm=0.331, clip=0, loss_scale=4, train_wall=95, gb_free=17.9, wall=8836 epoch 006: 771 / 1689 loss=4.521, nll_loss=2.933, ppl=7.64, wps=452333, ups=1.04, wpb=433510, bsz=16406.5, num_updates=9200, lr=0.00065938, gnorm=0.339, clip=0, loss_scale=2, train_wall=94, gb_free=20.6, wall=8932 epoch 006: 771 / 1689 loss=4.521, nll_loss=2.933, ppl=7.64, wps=452333, ups=1.04, wpb=433510, bsz=16406.5, num_updates=9200, lr=0.00065938, gnorm=0.339, clip=0, loss_scale=2, train_wall=94, gb_free=20.6, wall=8932 epoch 006: 771 / 1689 loss=4.521, nll_loss=2.933, ppl=7.64, wps=452333, ups=1.04, wpb=433510, bsz=16406.5, num_updates=9200, lr=0.00065938, gnorm=0.339, clip=0, loss_scale=2, train_wall=94, gb_free=20.6, wall=8932 epoch 006: 771 / 1689 loss=4.521, nll_loss=2.933, ppl=7.64, wps=452333, ups=1.04, wpb=433510, bsz=16406.5, num_updates=9200, lr=0.00065938, gnorm=0.339, clip=0, loss_scale=2, train_wall=94, gb_free=20.6, wall=8932 epoch 006: 771 / 1689 loss=4.521, nll_loss=2.933, ppl=7.64, wps=452333, ups=1.04, wpb=433510, bsz=16406.5, num_updates=9200, lr=0.00065938, gnorm=0.339, clip=0, loss_scale=2, train_wall=94, gb_free=20.6, wall=8932 epoch 006: 771 / 1689 loss=4.521, nll_loss=2.933, ppl=7.64, wps=452333, ups=1.04, wpb=433510, bsz=16406.5, num_updates=9200, lr=0.00065938, gnorm=0.339, clip=0, loss_scale=2, train_wall=94, gb_free=20.6, wall=8932 epoch 006: 871 / 1689 loss=4.537, nll_loss=2.952, ppl=7.74, wps=457453, ups=1.05, wpb=433893, bsz=16833.3, num_updates=9300, lr=0.000655826, gnorm=0.34, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=9027 epoch 006: 871 / 1689 loss=4.537, nll_loss=2.952, ppl=7.74, wps=457453, ups=1.05, wpb=433893, bsz=16833.3, num_updates=9300, lr=0.000655826, gnorm=0.34, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=9027 epoch 006: 871 / 1689 loss=4.537, nll_loss=2.952, ppl=7.74, wps=457453, ups=1.05, wpb=433893, bsz=16833.3, num_updates=9300, lr=0.000655826, gnorm=0.34, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=9027 epoch 006: 871 / 1689 loss=4.537, nll_loss=2.952, ppl=7.74, wps=457453, ups=1.05, wpb=433893, bsz=16833.3, num_updates=9300, lr=0.000655826, gnorm=0.34, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=9027 epoch 006: 871 / 1689 loss=4.537, nll_loss=2.952, ppl=7.74, wps=457453, ups=1.05, wpb=433893, bsz=16833.3, num_updates=9300, lr=0.000655826, gnorm=0.34, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=9027 epoch 006: 871 / 1689 loss=4.537, nll_loss=2.952, ppl=7.74, wps=457453, ups=1.05, wpb=433893, bsz=16833.3, num_updates=9300, lr=0.000655826, gnorm=0.34, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=9027 epoch 006: 971 / 1689 loss=4.525, nll_loss=2.938, ppl=7.66, wps=461842, ups=1.06, wpb=434361, bsz=16305.7, num_updates=9400, lr=0.000652328, gnorm=0.325, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9121 epoch 006: 971 / 1689 loss=4.525, nll_loss=2.938, ppl=7.66, wps=461842, ups=1.06, wpb=434361, bsz=16305.7, num_updates=9400, lr=0.000652328, gnorm=0.325, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9121 epoch 006: 971 / 1689 loss=4.525, nll_loss=2.938, ppl=7.66, wps=461842, ups=1.06, wpb=434361, bsz=16305.7, num_updates=9400, lr=0.000652328, gnorm=0.325, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9121 epoch 006: 971 / 1689 loss=4.525, nll_loss=2.938, ppl=7.66, wps=461842, ups=1.06, wpb=434361, bsz=16305.7, num_updates=9400, lr=0.000652328, gnorm=0.325, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9121 epoch 006: 971 / 1689 loss=4.525, nll_loss=2.938, ppl=7.66, wps=461842, ups=1.06, wpb=434361, bsz=16305.7, num_updates=9400, lr=0.000652328, gnorm=0.325, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9121 epoch 006: 971 / 1689 loss=4.525, nll_loss=2.938, ppl=7.66, wps=461842, ups=1.06, wpb=434361, bsz=16305.7, num_updates=9400, lr=0.000652328, gnorm=0.325, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9121 epoch 006: 1071 / 1689 loss=4.519, nll_loss=2.932, ppl=7.63, wps=455374, ups=1.05, wpb=432636, bsz=16451.7, num_updates=9500, lr=0.000648886, gnorm=0.34, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=9216 epoch 006: 1071 / 1689 loss=4.519, nll_loss=2.932, ppl=7.63, wps=455374, ups=1.05, wpb=432636, bsz=16451.7, num_updates=9500, lr=0.000648886, gnorm=0.34, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=9216 epoch 006: 1071 / 1689 loss=4.519, nll_loss=2.932, ppl=7.63, wps=455374, ups=1.05, wpb=432636, bsz=16451.7, num_updates=9500, lr=0.000648886, gnorm=0.34, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=9216 epoch 006: 1071 / 1689 loss=4.519, nll_loss=2.932, ppl=7.63, wps=455374, ups=1.05, wpb=432636, bsz=16451.7, num_updates=9500, lr=0.000648886, gnorm=0.34, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=9216 epoch 006: 1071 / 1689 loss=4.519, nll_loss=2.932, ppl=7.63, wps=455374, ups=1.05, wpb=432636, bsz=16451.7, num_updates=9500, lr=0.000648886, gnorm=0.34, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=9216 epoch 006: 1071 / 1689 loss=4.519, nll_loss=2.932, ppl=7.63, wps=455374, ups=1.05, wpb=432636, bsz=16451.7, num_updates=9500, lr=0.000648886, gnorm=0.34, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=9216 epoch 006: 1171 / 1689 loss=4.519, nll_loss=2.932, ppl=7.63, wps=457786, ups=1.05, wpb=434620, bsz=16664.2, num_updates=9600, lr=0.000645497, gnorm=0.336, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=9311 epoch 006: 1171 / 1689 loss=4.519, nll_loss=2.932, ppl=7.63, wps=457786, ups=1.05, wpb=434620, bsz=16664.2, num_updates=9600, lr=0.000645497, gnorm=0.336, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=9311 epoch 006: 1171 / 1689 loss=4.519, nll_loss=2.932, ppl=7.63, wps=457786, ups=1.05, wpb=434620, bsz=16664.2, num_updates=9600, lr=0.000645497, gnorm=0.336, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=9311 epoch 006: 1171 / 1689 loss=4.519, nll_loss=2.932, ppl=7.63, wps=457786, ups=1.05, wpb=434620, bsz=16664.2, num_updates=9600, lr=0.000645497, gnorm=0.336, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=9311 epoch 006: 1171 / 1689 loss=4.519, nll_loss=2.932, ppl=7.63, wps=457786, ups=1.05, wpb=434620, bsz=16664.2, num_updates=9600, lr=0.000645497, gnorm=0.336, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=9311 epoch 006: 1171 / 1689 loss=4.519, nll_loss=2.932, ppl=7.63, wps=457786, ups=1.05, wpb=434620, bsz=16664.2, num_updates=9600, lr=0.000645497, gnorm=0.336, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=9311 epoch 006: 1271 / 1689 loss=4.511, nll_loss=2.923, ppl=7.58, wps=458893, ups=1.06, wpb=433171, bsz=16261.9, num_updates=9700, lr=0.000642161, gnorm=0.331, clip=0, loss_scale=4, train_wall=93, gb_free=15.9, wall=9405 epoch 006: 1271 / 1689 loss=4.511, nll_loss=2.923, ppl=7.58, wps=458893, ups=1.06, wpb=433171, bsz=16261.9, num_updates=9700, lr=0.000642161, gnorm=0.331, clip=0, loss_scale=4, train_wall=93, gb_free=15.9, wall=9405 epoch 006: 1271 / 1689 loss=4.511, nll_loss=2.923, ppl=7.58, wps=458893, ups=1.06, wpb=433171, bsz=16261.9, num_updates=9700, lr=0.000642161, gnorm=0.331, clip=0, loss_scale=4, train_wall=93, gb_free=15.9, wall=9405 epoch 006: 1271 / 1689 loss=4.511, nll_loss=2.923, ppl=7.58, wps=458893, ups=1.06, wpb=433171, bsz=16261.9, num_updates=9700, lr=0.000642161, gnorm=0.331, clip=0, loss_scale=4, train_wall=93, gb_free=15.9, wall=9405 epoch 006: 1271 / 1689 loss=4.511, nll_loss=2.923, ppl=7.58, wps=458893, ups=1.06, wpb=433171, bsz=16261.9, num_updates=9700, lr=0.000642161, gnorm=0.331, clip=0, loss_scale=4, train_wall=93, gb_free=15.9, wall=9405 epoch 006: 1271 / 1689 loss=4.511, nll_loss=2.923, ppl=7.58, wps=458893, ups=1.06, wpb=433171, bsz=16261.9, num_updates=9700, lr=0.000642161, gnorm=0.331, clip=0, loss_scale=4, train_wall=93, gb_free=15.9, wall=9405 epoch 006: 1372 / 1689 loss=4.518, nll_loss=2.931, ppl=7.63, wps=454304, ups=1.05, wpb=432725, bsz=16620.9, num_updates=9800, lr=0.000638877, gnorm=0.338, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=9500 epoch 006: 1372 / 1689 loss=4.518, nll_loss=2.931, ppl=7.63, wps=454304, ups=1.05, wpb=432725, bsz=16620.9, num_updates=9800, lr=0.000638877, gnorm=0.338, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=9500 epoch 006: 1372 / 1689 loss=4.518, nll_loss=2.931, ppl=7.63, wps=454304, ups=1.05, wpb=432725, bsz=16620.9, num_updates=9800, lr=0.000638877, gnorm=0.338, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=9500 epoch 006: 1372 / 1689 loss=4.518, nll_loss=2.931, ppl=7.63, wps=454304, ups=1.05, wpb=432725, bsz=16620.9, num_updates=9800, lr=0.000638877, gnorm=0.338, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=9500 epoch 006: 1372 / 1689 loss=4.518, nll_loss=2.931, ppl=7.63, wps=454304, ups=1.05, wpb=432725, bsz=16620.9, num_updates=9800, lr=0.000638877, gnorm=0.338, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=9500 epoch 006: 1372 / 1689 loss=4.518, nll_loss=2.931, ppl=7.63, wps=454304, ups=1.05, wpb=432725, bsz=16620.9, num_updates=9800, lr=0.000638877, gnorm=0.338, clip=0, loss_scale=2, train_wall=94, gb_free=18.5, wall=9500 epoch 006: 1472 / 1689 loss=4.521, nll_loss=2.935, ppl=7.65, wps=463079, ups=1.06, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.322, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=9594 epoch 006: 1472 / 1689 loss=4.521, nll_loss=2.935, ppl=7.65, wps=463079, ups=1.06, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.322, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=9594 epoch 006: 1472 / 1689 loss=4.521, nll_loss=2.935, ppl=7.65, wps=463079, ups=1.06, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.322, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=9594 epoch 006: 1472 / 1689 loss=4.521, nll_loss=2.935, ppl=7.65, wps=463079, ups=1.06, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.322, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=9594 epoch 006: 1472 / 1689 loss=4.521, nll_loss=2.935, ppl=7.65, wps=463079, ups=1.06, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.322, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=9594 epoch 006: 1472 / 1689 loss=4.521, nll_loss=2.935, ppl=7.65, wps=463079, ups=1.06, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.322, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=9594 epoch 006: 1572 / 1689 loss=4.514, nll_loss=2.927, ppl=7.61, wps=458846, ups=1.05, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.32, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9689 epoch 006: 1572 / 1689 loss=4.514, nll_loss=2.927, ppl=7.61, wps=458846, ups=1.05, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.32, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9689 epoch 006: 1572 / 1689 loss=4.514, nll_loss=2.927, ppl=7.61, wps=458846, ups=1.05, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.32, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9689 epoch 006: 1572 / 1689 loss=4.514, nll_loss=2.927, ppl=7.61, wps=458846, ups=1.05, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.32, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9689 epoch 006: 1572 / 1689 loss=4.514, nll_loss=2.927, ppl=7.61, wps=458846, ups=1.05, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.32, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9689 epoch 006: 1572 / 1689 loss=4.514, nll_loss=2.927, ppl=7.61, wps=458846, ups=1.05, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.32, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=9689 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 4.416 | nll_loss 2.768 | ppl 6.81 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.416 epoch 006 | valid on 'valid' subset | loss 4.416 | nll_loss 2.768 | ppl 6.81 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.416 epoch 006 | valid on 'valid' subset | loss 4.416 | nll_loss 2.768 | ppl 6.81 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.416 epoch 006 | valid on 'valid' subset | loss 4.416 | nll_loss 2.768 | ppl 6.81 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.416 epoch 006 | valid on 'valid' subset | loss 4.416 | nll_loss 2.768 | ppl 6.81 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.416 epoch 006 | valid on 'valid' subset | loss 4.416 | nll_loss 2.768 | ppl 6.81 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.416 epoch 006: 1672 / 1689 loss=4.497, nll_loss=2.908, ppl=7.51, wps=331734, ups=0.77, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.333, clip=0, loss_scale=2, train_wall=95, gb_free=17.1, wall=9820 epoch 006: 1672 / 1689 loss=4.497, nll_loss=2.908, ppl=7.51, wps=331734, ups=0.77, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.333, clip=0, loss_scale=2, train_wall=95, gb_free=17.1, wall=9820 epoch 006: 1672 / 1689 loss=4.497, nll_loss=2.908, ppl=7.51, wps=331734, ups=0.77, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.333, clip=0, loss_scale=2, train_wall=95, gb_free=17.1, wall=9820 epoch 006: 1672 / 1689 loss=4.497, nll_loss=2.908, ppl=7.51, wps=331734, ups=0.77, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.333, clip=0, loss_scale=2, train_wall=95, gb_free=17.1, wall=9820 epoch 006: 1672 / 1689 loss=4.497, nll_loss=2.908, ppl=7.51, wps=331734, ups=0.77, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.333, clip=0, loss_scale=2, train_wall=95, gb_free=17.1, wall=9820 epoch 006: 1672 / 1689 loss=4.497, nll_loss=2.908, ppl=7.51, wps=331734, ups=0.77, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.333, clip=0, loss_scale=2, train_wall=95, gb_free=17.1, wall=9820 end of epoch 6 (average epoch stats below) epoch 006 | loss 4.523 | nll_loss 2.936 | ppl 7.65 | wps 441539 | ups 1.02 | wpb 433528 | bsz 16503.8 | num_updates 10117 | lr 0.000628788 | gnorm 0.336 | clip 0 | loss_scale 2 | train_wall 1577 | gb_free 22.3 | wall 9835 epoch 006 | loss 4.523 | nll_loss 2.936 | ppl 7.65 | wps 441539 | ups 1.02 | wpb 433528 | bsz 16503.8 | num_updates 10117 | lr 0.000628788 | gnorm 0.336 | clip 0 | loss_scale 2 | train_wall 1577 | gb_free 22.3 | wall 9835 epoch 006 | loss 4.523 | nll_loss 2.936 | ppl 7.65 | wps 441539 | ups 1.02 | wpb 433528 | bsz 16503.8 | num_updates 10117 | lr 0.000628788 | gnorm 0.336 | clip 0 | loss_scale 2 | train_wall 1577 | gb_free 22.3 | wall 9835 epoch 006 | loss 4.523 | nll_loss 2.936 | ppl 7.65 | wps 441539 | ups 1.02 | wpb 433528 | bsz 16503.8 | num_updates 10117 | lr 0.000628788 | gnorm 0.336 | clip 0 | loss_scale 2 | train_wall 1577 | gb_free 22.3 | wall 9835 epoch 006 | loss 4.523 | nll_loss 2.936 | ppl 7.65 | wps 441539 | ups 1.02 | wpb 433528 | bsz 16503.8 | num_updates 10117 | lr 0.000628788 | gnorm 0.336 | clip 0 | loss_scale 2 | train_wall 1577 | gb_free 22.3 | wall 9835 epoch 006 | loss 4.523 | nll_loss 2.936 | ppl 7.65 | wps 441539 | ups 1.02 | wpb 433528 | bsz 16503.8 | num_updates 10117 | lr 0.000628788 | gnorm 0.336 | clip 0 | loss_scale 2 | train_wall 1577 | gb_free 22.3 | wall 9835 Start iterating over samples epoch 007: 83 / 1689 loss=4.482, nll_loss=2.89, ppl=7.41, wps=458242, ups=1.07, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.33, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=9913 epoch 007: 83 / 1689 loss=4.482, nll_loss=2.89, ppl=7.41, wps=458242, ups=1.07, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.33, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=9913 epoch 007: 83 / 1689 loss=4.482, nll_loss=2.89, ppl=7.41, wps=458242, ups=1.07, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.33, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=9913 epoch 007: 83 / 1689 loss=4.482, nll_loss=2.89, ppl=7.41, wps=458242, ups=1.07, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.33, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=9913 epoch 007: 83 / 1689 loss=4.482, nll_loss=2.89, ppl=7.41, wps=458242, ups=1.07, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.33, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=9913 epoch 007: 83 / 1689 loss=4.482, nll_loss=2.89, ppl=7.41, wps=458242, ups=1.07, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.33, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=9913 epoch 007: 83 / 1689 loss=4.482, nll_loss=2.89, ppl=7.41, wps=458242, ups=1.07, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.33, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=9913 epoch 007: 183 / 1689 loss=4.473, nll_loss=2.88, ppl=7.36, wps=461052, ups=1.06, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.326, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10008 epoch 007: 183 / 1689 loss=4.473, nll_loss=2.88, ppl=7.36, wps=461052, ups=1.06, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.326, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10008 epoch 007: 183 / 1689 loss=4.473, nll_loss=2.88, ppl=7.36, wps=461052, ups=1.06, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.326, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10008 epoch 007: 183 / 1689 loss=4.473, nll_loss=2.88, ppl=7.36, wps=461052, ups=1.06, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.326, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10008 epoch 007: 183 / 1689 loss=4.473, nll_loss=2.88, ppl=7.36, wps=461052, ups=1.06, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.326, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10008 epoch 007: 183 / 1689 loss=4.473, nll_loss=2.88, ppl=7.36, wps=461052, ups=1.06, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.326, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10008 epoch 007: 183 / 1689 loss=4.473, nll_loss=2.88, ppl=7.36, wps=461052, ups=1.06, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.326, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=10008 epoch 007: 283 / 1689 loss=4.478, nll_loss=2.886, ppl=7.39, wps=458591, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.307, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=10102 epoch 007: 283 / 1689 loss=4.478, nll_loss=2.886, ppl=7.39, wps=458591, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.307, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=10102 epoch 007: 283 / 1689 loss=4.478, nll_loss=2.886, ppl=7.39, wps=458591, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.307, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=10102 epoch 007: 283 / 1689 loss=4.478, nll_loss=2.886, ppl=7.39, wps=458591, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.307, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=10102 epoch 007: 283 / 1689 loss=4.478, nll_loss=2.886, ppl=7.39, wps=458591, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.307, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=10102 epoch 007: 283 / 1689 loss=4.478, nll_loss=2.886, ppl=7.39, wps=458591, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.307, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=10102 epoch 007: 283 / 1689 loss=4.478, nll_loss=2.886, ppl=7.39, wps=458591, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.307, clip=0, loss_scale=4, train_wall=94, gb_free=19, wall=10102 epoch 007: 384 / 1689 loss=4.486, nll_loss=2.896, ppl=7.44, wps=457365, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.321, clip=0, loss_scale=2, train_wall=94, gb_free=17.7, wall=10197 epoch 007: 384 / 1689 loss=4.486, nll_loss=2.896, ppl=7.44, wps=457365, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.321, clip=0, loss_scale=2, train_wall=94, gb_free=17.7, wall=10197 epoch 007: 384 / 1689 loss=4.486, nll_loss=2.896, ppl=7.44, wps=457365, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.321, clip=0, loss_scale=2, train_wall=94, gb_free=17.7, wall=10197 epoch 007: 384 / 1689 loss=4.486, nll_loss=2.896, ppl=7.44, wps=457365, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.321, clip=0, loss_scale=2, train_wall=94, gb_free=17.7, wall=10197 epoch 007: 384 / 1689 loss=4.486, nll_loss=2.896, ppl=7.44, wps=457365, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.321, clip=0, loss_scale=2, train_wall=94, gb_free=17.7, wall=10197 epoch 007: 384 / 1689 loss=4.486, nll_loss=2.896, ppl=7.44, wps=457365, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.321, clip=0, loss_scale=2, train_wall=94, gb_free=17.7, wall=10197 epoch 007: 384 / 1689 loss=4.486, nll_loss=2.896, ppl=7.44, wps=457365, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.321, clip=0, loss_scale=2, train_wall=94, gb_free=17.7, wall=10197 epoch 007: 484 / 1689 loss=4.476, nll_loss=2.885, ppl=7.38, wps=459662, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.319, clip=0, loss_scale=2, train_wall=93, gb_free=17.2, wall=10292 epoch 007: 484 / 1689 loss=4.476, nll_loss=2.885, ppl=7.38, wps=459662, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.319, clip=0, loss_scale=2, train_wall=93, gb_free=17.2, wall=10292 epoch 007: 484 / 1689 loss=4.476, nll_loss=2.885, ppl=7.38, wps=459662, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.319, clip=0, loss_scale=2, train_wall=93, gb_free=17.2, wall=10292 epoch 007: 484 / 1689 loss=4.476, nll_loss=2.885, ppl=7.38, wps=459662, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.319, clip=0, loss_scale=2, train_wall=93, gb_free=17.2, wall=10292 epoch 007: 484 / 1689 loss=4.476, nll_loss=2.885, ppl=7.38, wps=459662, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.319, clip=0, loss_scale=2, train_wall=93, gb_free=17.2, wall=10292 epoch 007: 484 / 1689 loss=4.476, nll_loss=2.885, ppl=7.38, wps=459662, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.319, clip=0, loss_scale=2, train_wall=93, gb_free=17.2, wall=10292 epoch 007: 484 / 1689 loss=4.476, nll_loss=2.885, ppl=7.38, wps=459662, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.319, clip=0, loss_scale=2, train_wall=93, gb_free=17.2, wall=10292 epoch 007: 584 / 1689 loss=4.469, nll_loss=2.877, ppl=7.34, wps=456982, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=10386 epoch 007: 584 / 1689 loss=4.469, nll_loss=2.877, ppl=7.34, wps=456982, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=10386 epoch 007: 584 / 1689 loss=4.469, nll_loss=2.877, ppl=7.34, wps=456982, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=10386 epoch 007: 584 / 1689 loss=4.469, nll_loss=2.877, ppl=7.34, wps=456982, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=10386 epoch 007: 584 / 1689 loss=4.469, nll_loss=2.877, ppl=7.34, wps=456982, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=10386 epoch 007: 584 / 1689 loss=4.469, nll_loss=2.877, ppl=7.34, wps=456982, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=10386 epoch 007: 584 / 1689 loss=4.469, nll_loss=2.877, ppl=7.34, wps=456982, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.324, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=10386 epoch 007: 684 / 1689 loss=4.472, nll_loss=2.88, ppl=7.36, wps=456725, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.311, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=10481 epoch 007: 684 / 1689 loss=4.472, nll_loss=2.88, ppl=7.36, wps=456725, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.311, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=10481 epoch 007: 684 / 1689 loss=4.472, nll_loss=2.88, ppl=7.36, wps=456725, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.311, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=10481 epoch 007: 684 / 1689 loss=4.472, nll_loss=2.88, ppl=7.36, wps=456725, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.311, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=10481 epoch 007: 684 / 1689 loss=4.472, nll_loss=2.88, ppl=7.36, wps=456725, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.311, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=10481 epoch 007: 684 / 1689 loss=4.472, nll_loss=2.88, ppl=7.36, wps=456725, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.311, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=10481 epoch 007: 684 / 1689 loss=4.472, nll_loss=2.88, ppl=7.36, wps=456725, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.311, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=10481 epoch 007: 784 / 1689 loss=4.487, nll_loss=2.897, ppl=7.45, wps=459370, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.316, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=10576 epoch 007: 784 / 1689 loss=4.487, nll_loss=2.897, ppl=7.45, wps=459370, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.316, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=10576 epoch 007: 784 / 1689 loss=4.487, nll_loss=2.897, ppl=7.45, wps=459370, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.316, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=10576 epoch 007: 784 / 1689 loss=4.487, nll_loss=2.897, ppl=7.45, wps=459370, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.316, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=10576 epoch 007: 784 / 1689 loss=4.487, nll_loss=2.897, ppl=7.45, wps=459370, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.316, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=10576 epoch 007: 784 / 1689 loss=4.487, nll_loss=2.897, ppl=7.45, wps=459370, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.316, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=10576 epoch 007: 784 / 1689 loss=4.487, nll_loss=2.897, ppl=7.45, wps=459370, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.316, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=10576 epoch 007: 884 / 1689 loss=4.485, nll_loss=2.895, ppl=7.44, wps=462266, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=10669 epoch 007: 884 / 1689 loss=4.485, nll_loss=2.895, ppl=7.44, wps=462266, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=10669 epoch 007: 884 / 1689 loss=4.485, nll_loss=2.895, ppl=7.44, wps=462266, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=10669 epoch 007: 884 / 1689 loss=4.485, nll_loss=2.895, ppl=7.44, wps=462266, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=10669 epoch 007: 884 / 1689 loss=4.485, nll_loss=2.895, ppl=7.44, wps=462266, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=10669 epoch 007: 884 / 1689 loss=4.485, nll_loss=2.895, ppl=7.44, wps=462266, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=10669 epoch 007: 884 / 1689 loss=4.485, nll_loss=2.895, ppl=7.44, wps=462266, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=10669 begin validation on "valid" subset epoch 007 | valid on 'valid' subset | loss 4.405 | nll_loss 2.755 | ppl 6.75 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.405 epoch 007 | valid on 'valid' subset | loss 4.405 | nll_loss 2.755 | ppl 6.75 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.405 epoch 007 | valid on 'valid' subset | loss 4.405 | nll_loss 2.755 | ppl 6.75 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.405 epoch 007 | valid on 'valid' subset | loss 4.405 | nll_loss 2.755 | ppl 6.75 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.405 epoch 007 | valid on 'valid' subset | loss 4.405 | nll_loss 2.755 | ppl 6.75 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.405 epoch 007 | valid on 'valid' subset | loss 4.405 | nll_loss 2.755 | ppl 6.75 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.405 epoch 007 | valid on 'valid' subset | loss 4.405 | nll_loss 2.755 | ppl 6.75 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.405 epoch 007: 984 / 1689 loss=4.469, nll_loss=2.877, ppl=7.35, wps=298441, ups=0.69, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.339, clip=0, loss_scale=4, train_wall=119, gb_free=19.1, wall=10815 epoch 007: 984 / 1689 loss=4.469, nll_loss=2.877, ppl=7.35, wps=298441, ups=0.69, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.339, clip=0, loss_scale=4, train_wall=119, gb_free=19.1, wall=10815 epoch 007: 984 / 1689 loss=4.469, nll_loss=2.877, ppl=7.35, wps=298441, ups=0.69, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.339, clip=0, loss_scale=4, train_wall=119, gb_free=19.1, wall=10815 epoch 007: 984 / 1689 loss=4.469, nll_loss=2.877, ppl=7.35, wps=298441, ups=0.69, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.339, clip=0, loss_scale=4, train_wall=119, gb_free=19.1, wall=10815 epoch 007: 984 / 1689 loss=4.469, nll_loss=2.877, ppl=7.35, wps=298441, ups=0.69, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.339, clip=0, loss_scale=4, train_wall=119, gb_free=19.1, wall=10815 epoch 007: 984 / 1689 loss=4.469, nll_loss=2.877, ppl=7.35, wps=298441, ups=0.69, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.339, clip=0, loss_scale=4, train_wall=119, gb_free=19.1, wall=10815 epoch 007: 984 / 1689 loss=4.469, nll_loss=2.877, ppl=7.35, wps=298441, ups=0.69, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.339, clip=0, loss_scale=4, train_wall=119, gb_free=19.1, wall=10815 epoch 007: 1085 / 1689 loss=4.462, nll_loss=2.87, ppl=7.31, wps=452514, ups=1.05, wpb=432268, bsz=16995.7, num_updates=11200, lr=0.000597614, gnorm=0.324, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10910 epoch 007: 1085 / 1689 loss=4.462, nll_loss=2.87, ppl=7.31, wps=452514, ups=1.05, wpb=432268, bsz=16995.7, num_updates=11200, lr=0.000597614, gnorm=0.324, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10910 epoch 007: 1085 / 1689 loss=4.462, nll_loss=2.87, ppl=7.31, wps=452514, ups=1.05, wpb=432268, bsz=16995.7, num_updates=11200, lr=0.000597614, gnorm=0.324, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10910 epoch 007: 1085 / 1689 loss=4.462, nll_loss=2.87, ppl=7.31, wps=452514, ups=1.05, wpb=432268, bsz=16995.7, num_updates=11200, lr=0.000597614, gnorm=0.324, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10910 epoch 007: 1085 / 1689 loss=4.462, nll_loss=2.87, ppl=7.31, wps=452514, ups=1.05, wpb=432268, bsz=16995.7, num_updates=11200, lr=0.000597614, gnorm=0.324, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10910 epoch 007: 1085 / 1689 loss=4.462, nll_loss=2.87, ppl=7.31, wps=452514, ups=1.05, wpb=432268, bsz=16995.7, num_updates=11200, lr=0.000597614, gnorm=0.324, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10910 epoch 007: 1085 / 1689 loss=4.462, nll_loss=2.87, ppl=7.31, wps=452514, ups=1.05, wpb=432268, bsz=16995.7, num_updates=11200, lr=0.000597614, gnorm=0.324, clip=0, loss_scale=2, train_wall=94, gb_free=16.9, wall=10910 epoch 007: 1185 / 1689 loss=4.471, nll_loss=2.881, ppl=7.36, wps=460347, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=11005 epoch 007: 1185 / 1689 loss=4.471, nll_loss=2.881, ppl=7.36, wps=460347, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=11005 epoch 007: 1185 / 1689 loss=4.471, nll_loss=2.881, ppl=7.36, wps=460347, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=11005 epoch 007: 1185 / 1689 loss=4.471, nll_loss=2.881, ppl=7.36, wps=460347, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=11005 epoch 007: 1185 / 1689 loss=4.471, nll_loss=2.881, ppl=7.36, wps=460347, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=11005 epoch 007: 1185 / 1689 loss=4.471, nll_loss=2.881, ppl=7.36, wps=460347, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=11005 epoch 007: 1185 / 1689 loss=4.471, nll_loss=2.881, ppl=7.36, wps=460347, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=17.3, wall=11005 epoch 007: 1285 / 1689 loss=4.472, nll_loss=2.881, ppl=7.37, wps=463729, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.317, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=11099 epoch 007: 1285 / 1689 loss=4.472, nll_loss=2.881, ppl=7.37, wps=463729, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.317, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=11099 epoch 007: 1285 / 1689 loss=4.472, nll_loss=2.881, ppl=7.37, wps=463729, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.317, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=11099 epoch 007: 1285 / 1689 loss=4.472, nll_loss=2.881, ppl=7.37, wps=463729, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.317, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=11099 epoch 007: 1285 / 1689 loss=4.472, nll_loss=2.881, ppl=7.37, wps=463729, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.317, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=11099 epoch 007: 1285 / 1689 loss=4.472, nll_loss=2.881, ppl=7.37, wps=463729, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.317, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=11099 epoch 007: 1285 / 1689 loss=4.472, nll_loss=2.881, ppl=7.37, wps=463729, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.317, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=11099 epoch 007: 1385 / 1689 loss=4.48, nll_loss=2.891, ppl=7.42, wps=462794, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11192 epoch 007: 1385 / 1689 loss=4.48, nll_loss=2.891, ppl=7.42, wps=462794, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11192 epoch 007: 1385 / 1689 loss=4.48, nll_loss=2.891, ppl=7.42, wps=462794, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11192 epoch 007: 1385 / 1689 loss=4.48, nll_loss=2.891, ppl=7.42, wps=462794, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11192 epoch 007: 1385 / 1689 loss=4.48, nll_loss=2.891, ppl=7.42, wps=462794, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11192 epoch 007: 1385 / 1689 loss=4.48, nll_loss=2.891, ppl=7.42, wps=462794, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11192 epoch 007: 1385 / 1689 loss=4.48, nll_loss=2.891, ppl=7.42, wps=462794, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11192 epoch 007: 1485 / 1689 loss=4.477, nll_loss=2.888, ppl=7.4, wps=461474, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=11287 epoch 007: 1485 / 1689 loss=4.477, nll_loss=2.888, ppl=7.4, wps=461474, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=11287 epoch 007: 1485 / 1689 loss=4.477, nll_loss=2.888, ppl=7.4, wps=461474, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=11287 epoch 007: 1485 / 1689 loss=4.477, nll_loss=2.888, ppl=7.4, wps=461474, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=11287 epoch 007: 1485 / 1689 loss=4.477, nll_loss=2.888, ppl=7.4, wps=461474, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=11287 epoch 007: 1485 / 1689 loss=4.477, nll_loss=2.888, ppl=7.4, wps=461474, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=11287 epoch 007: 1485 / 1689 loss=4.477, nll_loss=2.888, ppl=7.4, wps=461474, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=11287 epoch 007: 1585 / 1689 loss=4.469, nll_loss=2.878, ppl=7.35, wps=458300, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.319, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=11381 epoch 007: 1585 / 1689 loss=4.469, nll_loss=2.878, ppl=7.35, wps=458300, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.319, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=11381 epoch 007: 1585 / 1689 loss=4.469, nll_loss=2.878, ppl=7.35, wps=458300, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.319, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=11381 epoch 007: 1585 / 1689 loss=4.469, nll_loss=2.878, ppl=7.35, wps=458300, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.319, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=11381 epoch 007: 1585 / 1689 loss=4.469, nll_loss=2.878, ppl=7.35, wps=458300, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.319, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=11381 epoch 007: 1585 / 1689 loss=4.469, nll_loss=2.878, ppl=7.35, wps=458300, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.319, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=11381 epoch 007: 1585 / 1689 loss=4.469, nll_loss=2.878, ppl=7.35, wps=458300, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.319, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=11381 epoch 007: 1685 / 1689 loss=4.463, nll_loss=2.872, ppl=7.32, wps=455816, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.295, clip=0, loss_scale=4, train_wall=93, gb_free=20.2, wall=11476 epoch 007: 1685 / 1689 loss=4.463, nll_loss=2.872, ppl=7.32, wps=455816, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.295, clip=0, loss_scale=4, train_wall=93, gb_free=20.2, wall=11476 epoch 007: 1685 / 1689 loss=4.463, nll_loss=2.872, ppl=7.32, wps=455816, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.295, clip=0, loss_scale=4, train_wall=93, gb_free=20.2, wall=11476 epoch 007: 1685 / 1689 loss=4.463, nll_loss=2.872, ppl=7.32, wps=455816, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.295, clip=0, loss_scale=4, train_wall=93, gb_free=20.2, wall=11476 epoch 007: 1685 / 1689 loss=4.463, nll_loss=2.872, ppl=7.32, wps=455816, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.295, clip=0, loss_scale=4, train_wall=93, gb_free=20.2, wall=11476 epoch 007: 1685 / 1689 loss=4.463, nll_loss=2.872, ppl=7.32, wps=455816, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.295, clip=0, loss_scale=4, train_wall=93, gb_free=20.2, wall=11476 epoch 007: 1685 / 1689 loss=4.463, nll_loss=2.872, ppl=7.32, wps=455816, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.295, clip=0, loss_scale=4, train_wall=93, gb_free=20.2, wall=11476 end of epoch 7 (average epoch stats below) epoch 007 | loss 4.474 | nll_loss 2.883 | ppl 7.38 | wps 444756 | ups 1.03 | wpb 433525 | bsz 16505.5 | num_updates 11804 | lr 0.000582124 | gnorm 0.316 | clip 0 | loss_scale 4 | train_wall 1594 | gb_free 19.3 | wall 11479 epoch 007 | loss 4.474 | nll_loss 2.883 | ppl 7.38 | wps 444756 | ups 1.03 | wpb 433525 | bsz 16505.5 | num_updates 11804 | lr 0.000582124 | gnorm 0.316 | clip 0 | loss_scale 4 | train_wall 1594 | gb_free 19.3 | wall 11479 epoch 007 | loss 4.474 | nll_loss 2.883 | ppl 7.38 | wps 444756 | ups 1.03 | wpb 433525 | bsz 16505.5 | num_updates 11804 | lr 0.000582124 | gnorm 0.316 | clip 0 | loss_scale 4 | train_wall 1594 | gb_free 19.3 | wall 11479 epoch 007 | loss 4.474 | nll_loss 2.883 | ppl 7.38 | wps 444756 | ups 1.03 | wpb 433525 | bsz 16505.5 | num_updates 11804 | lr 0.000582124 | gnorm 0.316 | clip 0 | loss_scale 4 | train_wall 1594 | gb_free 19.3 | wall 11479 epoch 007 | loss 4.474 | nll_loss 2.883 | ppl 7.38 | wps 444756 | ups 1.03 | wpb 433525 | bsz 16505.5 | num_updates 11804 | lr 0.000582124 | gnorm 0.316 | clip 0 | loss_scale 4 | train_wall 1594 | gb_free 19.3 | wall 11479 epoch 007 | loss 4.474 | nll_loss 2.883 | ppl 7.38 | wps 444756 | ups 1.03 | wpb 433525 | bsz 16505.5 | num_updates 11804 | lr 0.000582124 | gnorm 0.316 | clip 0 | loss_scale 4 | train_wall 1594 | gb_free 19.3 | wall 11479 epoch 007 | loss 4.474 | nll_loss 2.883 | ppl 7.38 | wps 444756 | ups 1.03 | wpb 433525 | bsz 16505.5 | num_updates 11804 | lr 0.000582124 | gnorm 0.316 | clip 0 | loss_scale 4 | train_wall 1594 | gb_free 19.3 | wall 11479 Start iterating over samples epoch 008: 96 / 1689 loss=4.427, nll_loss=2.83, ppl=7.11, wps=457364, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=20.2, wall=11570 epoch 008: 96 / 1689 loss=4.427, nll_loss=2.83, ppl=7.11, wps=457364, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=20.2, wall=11570 epoch 008: 96 / 1689 loss=4.427, nll_loss=2.83, ppl=7.11, wps=457364, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=20.2, wall=11570 epoch 008: 96 / 1689 loss=4.427, nll_loss=2.83, ppl=7.11, wps=457364, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=20.2, wall=11570 epoch 008: 96 / 1689 loss=4.427, nll_loss=2.83, ppl=7.11, wps=457364, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=20.2, wall=11570 epoch 008: 96 / 1689 loss=4.427, nll_loss=2.83, ppl=7.11, wps=457364, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=20.2, wall=11570 epoch 008: 96 / 1689 loss=4.427, nll_loss=2.83, ppl=7.11, wps=457364, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=20.2, wall=11570 epoch 008: 96 / 1689 loss=4.427, nll_loss=2.83, ppl=7.11, wps=457364, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.315, clip=0, loss_scale=4, train_wall=92, gb_free=20.2, wall=11570 epoch 008: 197 / 1689 loss=4.443, nll_loss=2.848, ppl=7.2, wps=457978, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11665 epoch 008: 197 / 1689 loss=4.443, nll_loss=2.848, ppl=7.2, wps=457978, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11665 epoch 008: 197 / 1689 loss=4.443, nll_loss=2.848, ppl=7.2, wps=457978, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11665 epoch 008: 197 / 1689 loss=4.443, nll_loss=2.848, ppl=7.2, wps=457978, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11665 epoch 008: 197 / 1689 loss=4.443, nll_loss=2.848, ppl=7.2, wps=457978, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11665 epoch 008: 197 / 1689 loss=4.443, nll_loss=2.848, ppl=7.2, wps=457978, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11665 epoch 008: 197 / 1689 loss=4.443, nll_loss=2.848, ppl=7.2, wps=457978, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11665 epoch 008: 197 / 1689 loss=4.443, nll_loss=2.848, ppl=7.2, wps=457978, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11665 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 4.386 | nll_loss 2.739 | ppl 6.67 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.386 epoch 008 | valid on 'valid' subset | loss 4.386 | nll_loss 2.739 | ppl 6.67 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.386 epoch 008 | valid on 'valid' subset | loss 4.386 | nll_loss 2.739 | ppl 6.67 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.386 epoch 008 | valid on 'valid' subset | loss 4.386 | nll_loss 2.739 | ppl 6.67 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.386 epoch 008 | valid on 'valid' subset | loss 4.386 | nll_loss 2.739 | ppl 6.67 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.386 epoch 008 | valid on 'valid' subset | loss 4.386 | nll_loss 2.739 | ppl 6.67 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.386 epoch 008 | valid on 'valid' subset | loss 4.386 | nll_loss 2.739 | ppl 6.67 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.386 epoch 008 | valid on 'valid' subset | loss 4.386 | nll_loss 2.739 | ppl 6.67 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.386 epoch 008: 297 / 1689 loss=4.444, nll_loss=2.85, ppl=7.21, wps=377119, ups=0.87, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11780 epoch 008: 297 / 1689 loss=4.444, nll_loss=2.85, ppl=7.21, wps=377119, ups=0.87, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11780 epoch 008: 297 / 1689 loss=4.444, nll_loss=2.85, ppl=7.21, wps=377119, ups=0.87, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11780 epoch 008: 297 / 1689 loss=4.444, nll_loss=2.85, ppl=7.21, wps=377119, ups=0.87, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11780 epoch 008: 297 / 1689 loss=4.444, nll_loss=2.85, ppl=7.21, wps=377119, ups=0.87, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11780 epoch 008: 297 / 1689 loss=4.444, nll_loss=2.85, ppl=7.21, wps=377119, ups=0.87, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11780 epoch 008: 297 / 1689 loss=4.444, nll_loss=2.85, ppl=7.21, wps=377119, ups=0.87, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11780 epoch 008: 297 / 1689 loss=4.444, nll_loss=2.85, ppl=7.21, wps=377119, ups=0.87, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.308, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=11780 epoch 008: 397 / 1689 loss=4.452, nll_loss=2.859, ppl=7.25, wps=457837, ups=1.05, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.305, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=11875 epoch 008: 397 / 1689 loss=4.452, nll_loss=2.859, ppl=7.25, wps=457837, ups=1.05, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.305, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=11875 epoch 008: 397 / 1689 loss=4.452, nll_loss=2.859, ppl=7.25, wps=457837, ups=1.05, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.305, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=11875 epoch 008: 397 / 1689 loss=4.452, nll_loss=2.859, ppl=7.25, wps=457837, ups=1.05, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.305, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=11875 epoch 008: 397 / 1689 loss=4.452, nll_loss=2.859, ppl=7.25, wps=457837, ups=1.05, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.305, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=11875 epoch 008: 397 / 1689 loss=4.452, nll_loss=2.859, ppl=7.25, wps=457837, ups=1.05, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.305, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=11875 epoch 008: 397 / 1689 loss=4.452, nll_loss=2.859, ppl=7.25, wps=457837, ups=1.05, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.305, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=11875 epoch 008: 397 / 1689 loss=4.452, nll_loss=2.859, ppl=7.25, wps=457837, ups=1.05, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.305, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=11875 epoch 008: 497 / 1689 loss=4.431, nll_loss=2.834, ppl=7.13, wps=459454, ups=1.07, wpb=431064, bsz=16068.6, num_updates=12300, lr=0.000570266, gnorm=0.309, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=11969 epoch 008: 497 / 1689 loss=4.431, nll_loss=2.834, ppl=7.13, wps=459454, ups=1.07, wpb=431064, bsz=16068.6, num_updates=12300, lr=0.000570266, gnorm=0.309, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=11969 epoch 008: 497 / 1689 loss=4.431, nll_loss=2.834, ppl=7.13, wps=459454, ups=1.07, wpb=431064, bsz=16068.6, num_updates=12300, lr=0.000570266, gnorm=0.309, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=11969 epoch 008: 497 / 1689 loss=4.431, nll_loss=2.834, ppl=7.13, wps=459454, ups=1.07, wpb=431064, bsz=16068.6, num_updates=12300, lr=0.000570266, gnorm=0.309, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=11969 epoch 008: 497 / 1689 loss=4.431, nll_loss=2.834, ppl=7.13, wps=459454, ups=1.07, wpb=431064, bsz=16068.6, num_updates=12300, lr=0.000570266, gnorm=0.309, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=11969 epoch 008: 497 / 1689 loss=4.431, nll_loss=2.834, ppl=7.13, wps=459454, ups=1.07, wpb=431064, bsz=16068.6, num_updates=12300, lr=0.000570266, gnorm=0.309, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=11969 epoch 008: 497 / 1689 loss=4.431, nll_loss=2.834, ppl=7.13, wps=459454, ups=1.07, wpb=431064, bsz=16068.6, num_updates=12300, lr=0.000570266, gnorm=0.309, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=11969 epoch 008: 497 / 1689 loss=4.431, nll_loss=2.834, ppl=7.13, wps=459454, ups=1.07, wpb=431064, bsz=16068.6, num_updates=12300, lr=0.000570266, gnorm=0.309, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=11969 epoch 008: 597 / 1689 loss=4.441, nll_loss=2.846, ppl=7.19, wps=456175, ups=1.06, wpb=430214, bsz=16383.2, num_updates=12400, lr=0.000567962, gnorm=0.318, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=12063 epoch 008: 597 / 1689 loss=4.441, nll_loss=2.846, ppl=7.19, wps=456175, ups=1.06, wpb=430214, bsz=16383.2, num_updates=12400, lr=0.000567962, gnorm=0.318, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=12063 epoch 008: 597 / 1689 loss=4.441, nll_loss=2.846, ppl=7.19, wps=456175, ups=1.06, wpb=430214, bsz=16383.2, num_updates=12400, lr=0.000567962, gnorm=0.318, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=12063 epoch 008: 597 / 1689 loss=4.441, nll_loss=2.846, ppl=7.19, wps=456175, ups=1.06, wpb=430214, bsz=16383.2, num_updates=12400, lr=0.000567962, gnorm=0.318, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=12063 epoch 008: 597 / 1689 loss=4.441, nll_loss=2.846, ppl=7.19, wps=456175, ups=1.06, wpb=430214, bsz=16383.2, num_updates=12400, lr=0.000567962, gnorm=0.318, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=12063 epoch 008: 597 / 1689 loss=4.441, nll_loss=2.846, ppl=7.19, wps=456175, ups=1.06, wpb=430214, bsz=16383.2, num_updates=12400, lr=0.000567962, gnorm=0.318, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=12063 epoch 008: 597 / 1689 loss=4.441, nll_loss=2.846, ppl=7.19, wps=456175, ups=1.06, wpb=430214, bsz=16383.2, num_updates=12400, lr=0.000567962, gnorm=0.318, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=12063 epoch 008: 597 / 1689 loss=4.441, nll_loss=2.846, ppl=7.19, wps=456175, ups=1.06, wpb=430214, bsz=16383.2, num_updates=12400, lr=0.000567962, gnorm=0.318, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=12063 epoch 008: 697 / 1689 loss=4.43, nll_loss=2.834, ppl=7.13, wps=457882, ups=1.06, wpb=433334, bsz=16592.1, num_updates=12500, lr=0.000565685, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=12158 epoch 008: 697 / 1689 loss=4.43, nll_loss=2.834, ppl=7.13, wps=457882, ups=1.06, wpb=433334, bsz=16592.1, num_updates=12500, lr=0.000565685, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=12158 epoch 008: 697 / 1689 loss=4.43, nll_loss=2.834, ppl=7.13, wps=457882, ups=1.06, wpb=433334, bsz=16592.1, num_updates=12500, lr=0.000565685, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=12158 epoch 008: 697 / 1689 loss=4.43, nll_loss=2.834, ppl=7.13, wps=457882, ups=1.06, wpb=433334, bsz=16592.1, num_updates=12500, lr=0.000565685, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=12158 epoch 008: 697 / 1689 loss=4.43, nll_loss=2.834, ppl=7.13, wps=457882, ups=1.06, wpb=433334, bsz=16592.1, num_updates=12500, lr=0.000565685, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=12158 epoch 008: 697 / 1689 loss=4.43, nll_loss=2.834, ppl=7.13, wps=457882, ups=1.06, wpb=433334, bsz=16592.1, num_updates=12500, lr=0.000565685, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=12158 epoch 008: 697 / 1689 loss=4.43, nll_loss=2.834, ppl=7.13, wps=457882, ups=1.06, wpb=433334, bsz=16592.1, num_updates=12500, lr=0.000565685, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=12158 epoch 008: 697 / 1689 loss=4.43, nll_loss=2.834, ppl=7.13, wps=457882, ups=1.06, wpb=433334, bsz=16592.1, num_updates=12500, lr=0.000565685, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=12158 epoch 008: 797 / 1689 loss=4.438, nll_loss=2.843, ppl=7.18, wps=459961, ups=1.06, wpb=434403, bsz=16391.2, num_updates=12600, lr=0.000563436, gnorm=0.285, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=12252 epoch 008: 797 / 1689 loss=4.438, nll_loss=2.843, ppl=7.18, wps=459961, ups=1.06, wpb=434403, bsz=16391.2, num_updates=12600, lr=0.000563436, gnorm=0.285, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=12252 epoch 008: 797 / 1689 loss=4.438, nll_loss=2.843, ppl=7.18, wps=459961, ups=1.06, wpb=434403, bsz=16391.2, num_updates=12600, lr=0.000563436, gnorm=0.285, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=12252 epoch 008: 797 / 1689 loss=4.438, nll_loss=2.843, ppl=7.18, wps=459961, ups=1.06, wpb=434403, bsz=16391.2, num_updates=12600, lr=0.000563436, gnorm=0.285, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=12252 epoch 008: 797 / 1689 loss=4.438, nll_loss=2.843, ppl=7.18, wps=459961, ups=1.06, wpb=434403, bsz=16391.2, num_updates=12600, lr=0.000563436, gnorm=0.285, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=12252 epoch 008: 797 / 1689 loss=4.438, nll_loss=2.843, ppl=7.18, wps=459961, ups=1.06, wpb=434403, bsz=16391.2, num_updates=12600, lr=0.000563436, gnorm=0.285, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=12252 epoch 008: 797 / 1689 loss=4.438, nll_loss=2.843, ppl=7.18, wps=459961, ups=1.06, wpb=434403, bsz=16391.2, num_updates=12600, lr=0.000563436, gnorm=0.285, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=12252 epoch 008: 797 / 1689 loss=4.438, nll_loss=2.843, ppl=7.18, wps=459961, ups=1.06, wpb=434403, bsz=16391.2, num_updates=12600, lr=0.000563436, gnorm=0.285, clip=0, loss_scale=4, train_wall=92, gb_free=18.3, wall=12252 epoch 008: 897 / 1689 loss=4.452, nll_loss=2.859, ppl=7.26, wps=462814, ups=1.06, wpb=434982, bsz=16631.7, num_updates=12700, lr=0.000561214, gnorm=0.307, clip=0, loss_scale=4, train_wall=92, gb_free=19.2, wall=12346 epoch 008: 897 / 1689 loss=4.452, nll_loss=2.859, ppl=7.26, wps=462814, ups=1.06, wpb=434982, bsz=16631.7, num_updates=12700, lr=0.000561214, gnorm=0.307, clip=0, loss_scale=4, train_wall=92, gb_free=19.2, wall=12346 epoch 008: 897 / 1689 loss=4.452, nll_loss=2.859, ppl=7.26, wps=462814, ups=1.06, wpb=434982, bsz=16631.7, num_updates=12700, lr=0.000561214, gnorm=0.307, clip=0, loss_scale=4, train_wall=92, gb_free=19.2, wall=12346 epoch 008: 897 / 1689 loss=4.452, nll_loss=2.859, ppl=7.26, wps=462814, ups=1.06, wpb=434982, bsz=16631.7, num_updates=12700, lr=0.000561214, gnorm=0.307, clip=0, loss_scale=4, train_wall=92, gb_free=19.2, wall=12346 epoch 008: 897 / 1689 loss=4.452, nll_loss=2.859, ppl=7.26, wps=462814, ups=1.06, wpb=434982, bsz=16631.7, num_updates=12700, lr=0.000561214, gnorm=0.307, clip=0, loss_scale=4, train_wall=92, gb_free=19.2, wall=12346 epoch 008: 897 / 1689 loss=4.452, nll_loss=2.859, ppl=7.26, wps=462814, ups=1.06, wpb=434982, bsz=16631.7, num_updates=12700, lr=0.000561214, gnorm=0.307, clip=0, loss_scale=4, train_wall=92, gb_free=19.2, wall=12346 epoch 008: 897 / 1689 loss=4.452, nll_loss=2.859, ppl=7.26, wps=462814, ups=1.06, wpb=434982, bsz=16631.7, num_updates=12700, lr=0.000561214, gnorm=0.307, clip=0, loss_scale=4, train_wall=92, gb_free=19.2, wall=12346 epoch 008: 897 / 1689 loss=4.452, nll_loss=2.859, ppl=7.26, wps=462814, ups=1.06, wpb=434982, bsz=16631.7, num_updates=12700, lr=0.000561214, gnorm=0.307, clip=0, loss_scale=4, train_wall=92, gb_free=19.2, wall=12346 epoch 008: 997 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=464476, ups=1.06, wpb=436567, bsz=16335.7, num_updates=12800, lr=0.000559017, gnorm=0.289, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=12440 epoch 008: 997 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=464476, ups=1.06, wpb=436567, bsz=16335.7, num_updates=12800, lr=0.000559017, gnorm=0.289, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=12440 epoch 008: 997 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=464476, ups=1.06, wpb=436567, bsz=16335.7, num_updates=12800, lr=0.000559017, gnorm=0.289, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=12440 epoch 008: 997 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=464476, ups=1.06, wpb=436567, bsz=16335.7, num_updates=12800, lr=0.000559017, gnorm=0.289, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=12440 epoch 008: 997 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=464476, ups=1.06, wpb=436567, bsz=16335.7, num_updates=12800, lr=0.000559017, gnorm=0.289, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=12440 epoch 008: 997 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=464476, ups=1.06, wpb=436567, bsz=16335.7, num_updates=12800, lr=0.000559017, gnorm=0.289, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=12440 epoch 008: 997 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=464476, ups=1.06, wpb=436567, bsz=16335.7, num_updates=12800, lr=0.000559017, gnorm=0.289, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=12440 epoch 008: 997 / 1689 loss=4.453, nll_loss=2.861, ppl=7.26, wps=464476, ups=1.06, wpb=436567, bsz=16335.7, num_updates=12800, lr=0.000559017, gnorm=0.289, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=12440 epoch 008: 1097 / 1689 loss=4.433, nll_loss=2.838, ppl=7.15, wps=462202, ups=1.06, wpb=434259, bsz=16427, num_updates=12900, lr=0.000556846, gnorm=0.302, clip=0, loss_scale=4, train_wall=92, gb_free=18.9, wall=12534 epoch 008: 1097 / 1689 loss=4.433, nll_loss=2.838, ppl=7.15, wps=462202, ups=1.06, wpb=434259, bsz=16427, num_updates=12900, lr=0.000556846, gnorm=0.302, clip=0, loss_scale=4, train_wall=92, gb_free=18.9, wall=12534 epoch 008: 1097 / 1689 loss=4.433, nll_loss=2.838, ppl=7.15, wps=462202, ups=1.06, wpb=434259, bsz=16427, num_updates=12900, lr=0.000556846, gnorm=0.302, clip=0, loss_scale=4, train_wall=92, gb_free=18.9, wall=12534 epoch 008: 1097 / 1689 loss=4.433, nll_loss=2.838, ppl=7.15, wps=462202, ups=1.06, wpb=434259, bsz=16427, num_updates=12900, lr=0.000556846, gnorm=0.302, clip=0, loss_scale=4, train_wall=92, gb_free=18.9, wall=12534 epoch 008: 1097 / 1689 loss=4.433, nll_loss=2.838, ppl=7.15, wps=462202, ups=1.06, wpb=434259, bsz=16427, num_updates=12900, lr=0.000556846, gnorm=0.302, clip=0, loss_scale=4, train_wall=92, gb_free=18.9, wall=12534 epoch 008: 1097 / 1689 loss=4.433, nll_loss=2.838, ppl=7.15, wps=462202, ups=1.06, wpb=434259, bsz=16427, num_updates=12900, lr=0.000556846, gnorm=0.302, clip=0, loss_scale=4, train_wall=92, gb_free=18.9, wall=12534 epoch 008: 1097 / 1689 loss=4.433, nll_loss=2.838, ppl=7.15, wps=462202, ups=1.06, wpb=434259, bsz=16427, num_updates=12900, lr=0.000556846, gnorm=0.302, clip=0, loss_scale=4, train_wall=92, gb_free=18.9, wall=12534 epoch 008: 1097 / 1689 loss=4.433, nll_loss=2.838, ppl=7.15, wps=462202, ups=1.06, wpb=434259, bsz=16427, num_updates=12900, lr=0.000556846, gnorm=0.302, clip=0, loss_scale=4, train_wall=92, gb_free=18.9, wall=12534 epoch 008: 1197 / 1689 loss=4.43, nll_loss=2.836, ppl=7.14, wps=461577, ups=1.06, wpb=434024, bsz=16292.1, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=12628 epoch 008: 1197 / 1689 loss=4.43, nll_loss=2.836, ppl=7.14, wps=461577, ups=1.06, wpb=434024, bsz=16292.1, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=12628 epoch 008: 1197 / 1689 loss=4.43, nll_loss=2.836, ppl=7.14, wps=461577, ups=1.06, wpb=434024, bsz=16292.1, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=12628 epoch 008: 1197 / 1689 loss=4.43, nll_loss=2.836, ppl=7.14, wps=461577, ups=1.06, wpb=434024, bsz=16292.1, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=12628 epoch 008: 1197 / 1689 loss=4.43, nll_loss=2.836, ppl=7.14, wps=461577, ups=1.06, wpb=434024, bsz=16292.1, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=12628 epoch 008: 1197 / 1689 loss=4.43, nll_loss=2.836, ppl=7.14, wps=461577, ups=1.06, wpb=434024, bsz=16292.1, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=12628 epoch 008: 1197 / 1689 loss=4.43, nll_loss=2.836, ppl=7.14, wps=461577, ups=1.06, wpb=434024, bsz=16292.1, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=12628 epoch 008: 1197 / 1689 loss=4.43, nll_loss=2.836, ppl=7.14, wps=461577, ups=1.06, wpb=434024, bsz=16292.1, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=12628 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 4.371 | nll_loss 2.72 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.371 epoch 008 | valid on 'valid' subset | loss 4.371 | nll_loss 2.72 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.371 epoch 008 | valid on 'valid' subset | loss 4.371 | nll_loss 2.72 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.371 epoch 008 | valid on 'valid' subset | loss 4.371 | nll_loss 2.72 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.371 epoch 008 | valid on 'valid' subset | loss 4.371 | nll_loss 2.72 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.371 epoch 008 | valid on 'valid' subset | loss 4.371 | nll_loss 2.72 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.371 epoch 008 | valid on 'valid' subset | loss 4.371 | nll_loss 2.72 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.371 epoch 008 | valid on 'valid' subset | loss 4.371 | nll_loss 2.72 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.371 epoch 008: 1299 / 1689 loss=4.441, nll_loss=2.847, ppl=7.2, wps=376925, ups=0.87, wpb=434988, bsz=16645.7, num_updates=13100, lr=0.000552579, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12744 epoch 008: 1299 / 1689 loss=4.441, nll_loss=2.847, ppl=7.2, wps=376925, ups=0.87, wpb=434988, bsz=16645.7, num_updates=13100, lr=0.000552579, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12744 epoch 008: 1299 / 1689 loss=4.441, nll_loss=2.847, ppl=7.2, wps=376925, ups=0.87, wpb=434988, bsz=16645.7, num_updates=13100, lr=0.000552579, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12744 epoch 008: 1299 / 1689 loss=4.441, nll_loss=2.847, ppl=7.2, wps=376925, ups=0.87, wpb=434988, bsz=16645.7, num_updates=13100, lr=0.000552579, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12744 epoch 008: 1299 / 1689 loss=4.441, nll_loss=2.847, ppl=7.2, wps=376925, ups=0.87, wpb=434988, bsz=16645.7, num_updates=13100, lr=0.000552579, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12744 epoch 008: 1299 / 1689 loss=4.441, nll_loss=2.847, ppl=7.2, wps=376925, ups=0.87, wpb=434988, bsz=16645.7, num_updates=13100, lr=0.000552579, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12744 epoch 008: 1299 / 1689 loss=4.441, nll_loss=2.847, ppl=7.2, wps=376925, ups=0.87, wpb=434988, bsz=16645.7, num_updates=13100, lr=0.000552579, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12744 epoch 008: 1299 / 1689 loss=4.441, nll_loss=2.847, ppl=7.2, wps=376925, ups=0.87, wpb=434988, bsz=16645.7, num_updates=13100, lr=0.000552579, gnorm=0.287, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=12744 epoch 008: 1399 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=458519, ups=1.06, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.294, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=12838 epoch 008: 1399 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=458519, ups=1.06, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.294, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=12838 epoch 008: 1399 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=458519, ups=1.06, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.294, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=12838 epoch 008: 1399 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=458519, ups=1.06, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.294, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=12838 epoch 008: 1399 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=458519, ups=1.06, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.294, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=12838 epoch 008: 1399 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=458519, ups=1.06, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.294, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=12838 epoch 008: 1399 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=458519, ups=1.06, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.294, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=12838 epoch 008: 1399 / 1689 loss=4.436, nll_loss=2.843, ppl=7.17, wps=458519, ups=1.06, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.294, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=12838 epoch 008: 1499 / 1689 loss=4.429, nll_loss=2.834, ppl=7.13, wps=460378, ups=1.06, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.301, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=12932 epoch 008: 1499 / 1689 loss=4.429, nll_loss=2.834, ppl=7.13, wps=460378, ups=1.06, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.301, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=12932 epoch 008: 1499 / 1689 loss=4.429, nll_loss=2.834, ppl=7.13, wps=460378, ups=1.06, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.301, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=12932 epoch 008: 1499 / 1689 loss=4.429, nll_loss=2.834, ppl=7.13, wps=460378, ups=1.06, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.301, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=12932 epoch 008: 1499 / 1689 loss=4.429, nll_loss=2.834, ppl=7.13, wps=460378, ups=1.06, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.301, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=12932 epoch 008: 1499 / 1689 loss=4.429, nll_loss=2.834, ppl=7.13, wps=460378, ups=1.06, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.301, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=12932 epoch 008: 1499 / 1689 loss=4.429, nll_loss=2.834, ppl=7.13, wps=460378, ups=1.06, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.301, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=12932 epoch 008: 1499 / 1689 loss=4.429, nll_loss=2.834, ppl=7.13, wps=460378, ups=1.06, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.301, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=12932 epoch 008: 1599 / 1689 loss=4.429, nll_loss=2.835, ppl=7.13, wps=460396, ups=1.07, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13026 epoch 008: 1599 / 1689 loss=4.429, nll_loss=2.835, ppl=7.13, wps=460396, ups=1.07, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13026 epoch 008: 1599 / 1689 loss=4.429, nll_loss=2.835, ppl=7.13, wps=460396, ups=1.07, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13026 epoch 008: 1599 / 1689 loss=4.429, nll_loss=2.835, ppl=7.13, wps=460396, ups=1.07, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13026 epoch 008: 1599 / 1689 loss=4.429, nll_loss=2.835, ppl=7.13, wps=460396, ups=1.07, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13026 epoch 008: 1599 / 1689 loss=4.429, nll_loss=2.835, ppl=7.13, wps=460396, ups=1.07, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13026 epoch 008: 1599 / 1689 loss=4.429, nll_loss=2.835, ppl=7.13, wps=460396, ups=1.07, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13026 epoch 008: 1599 / 1689 loss=4.429, nll_loss=2.835, ppl=7.13, wps=460396, ups=1.07, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=13026 end of epoch 8 (average epoch stats below) epoch 008 | loss 4.438 | nll_loss 2.843 | ppl 7.18 | wps 448262 | ups 1.03 | wpb 433538 | bsz 16502.5 | num_updates 13490 | lr 0.000544533 | gnorm 0.299 | clip 0 | loss_scale 2 | train_wall 1561 | gb_free 20.1 | wall 13110 epoch 008 | loss 4.438 | nll_loss 2.843 | ppl 7.18 | wps 448262 | ups 1.03 | wpb 433538 | bsz 16502.5 | num_updates 13490 | lr 0.000544533 | gnorm 0.299 | clip 0 | loss_scale 2 | train_wall 1561 | gb_free 20.1 | wall 13110 epoch 008 | loss 4.438 | nll_loss 2.843 | ppl 7.18 | wps 448262 | ups 1.03 | wpb 433538 | bsz 16502.5 | num_updates 13490 | lr 0.000544533 | gnorm 0.299 | clip 0 | loss_scale 2 | train_wall 1561 | gb_free 20.1 | wall 13110 epoch 008 | loss 4.438 | nll_loss 2.843 | ppl 7.18 | wps 448262 | ups 1.03 | wpb 433538 | bsz 16502.5 | num_updates 13490 | lr 0.000544533 | gnorm 0.299 | clip 0 | loss_scale 2 | train_wall 1561 | gb_free 20.1 | wall 13110 epoch 008 | loss 4.438 | nll_loss 2.843 | ppl 7.18 | wps 448262 | ups 1.03 | wpb 433538 | bsz 16502.5 | num_updates 13490 | lr 0.000544533 | gnorm 0.299 | clip 0 | loss_scale 2 | train_wall 1561 | gb_free 20.1 | wall 13110 epoch 008 | loss 4.438 | nll_loss 2.843 | ppl 7.18 | wps 448262 | ups 1.03 | wpb 433538 | bsz 16502.5 | num_updates 13490 | lr 0.000544533 | gnorm 0.299 | clip 0 | loss_scale 2 | train_wall 1561 | gb_free 20.1 | wall 13110 epoch 008 | loss 4.438 | nll_loss 2.843 | ppl 7.18 | wps 448262 | ups 1.03 | wpb 433538 | bsz 16502.5 | num_updates 13490 | lr 0.000544533 | gnorm 0.299 | clip 0 | loss_scale 2 | train_wall 1561 | gb_free 20.1 | wall 13110 epoch 008 | loss 4.438 | nll_loss 2.843 | ppl 7.18 | wps 448262 | ups 1.03 | wpb 433538 | bsz 16502.5 | num_updates 13490 | lr 0.000544533 | gnorm 0.299 | clip 0 | loss_scale 2 | train_wall 1561 | gb_free 20.1 | wall 13110 Start iterating over samples epoch 009: 10 / 1689 loss=4.425, nll_loss=2.83, ppl=7.11, wps=460216, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=13120 epoch 009: 10 / 1689 loss=4.425, nll_loss=2.83, ppl=7.11, wps=460216, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=13120 epoch 009: 10 / 1689 loss=4.425, nll_loss=2.83, ppl=7.11, wps=460216, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=13120 epoch 009: 10 / 1689 loss=4.425, nll_loss=2.83, ppl=7.11, wps=460216, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=13120 epoch 009: 10 / 1689 loss=4.425, nll_loss=2.83, ppl=7.11, wps=460216, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=13120 epoch 009: 10 / 1689 loss=4.425, nll_loss=2.83, ppl=7.11, wps=460216, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=13120 epoch 009: 10 / 1689 loss=4.425, nll_loss=2.83, ppl=7.11, wps=460216, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=13120 epoch 009: 10 / 1689 loss=4.425, nll_loss=2.83, ppl=7.11, wps=460216, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=13120 epoch 009: 10 / 1689 loss=4.425, nll_loss=2.83, ppl=7.11, wps=460216, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=13120 epoch 009: 111 / 1689 loss=4.396, nll_loss=2.796, ppl=6.94, wps=454379, ups=1.05, wpb=431336, bsz=16573.1, num_updates=13600, lr=0.000542326, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13215 epoch 009: 111 / 1689 loss=4.396, nll_loss=2.796, ppl=6.94, wps=454379, ups=1.05, wpb=431336, bsz=16573.1, num_updates=13600, lr=0.000542326, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13215 epoch 009: 111 / 1689 loss=4.396, nll_loss=2.796, ppl=6.94, wps=454379, ups=1.05, wpb=431336, bsz=16573.1, num_updates=13600, lr=0.000542326, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13215 epoch 009: 111 / 1689 loss=4.396, nll_loss=2.796, ppl=6.94, wps=454379, ups=1.05, wpb=431336, bsz=16573.1, num_updates=13600, lr=0.000542326, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13215 epoch 009: 111 / 1689 loss=4.396, nll_loss=2.796, ppl=6.94, wps=454379, ups=1.05, wpb=431336, bsz=16573.1, num_updates=13600, lr=0.000542326, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13215 epoch 009: 111 / 1689 loss=4.396, nll_loss=2.796, ppl=6.94, wps=454379, ups=1.05, wpb=431336, bsz=16573.1, num_updates=13600, lr=0.000542326, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13215 epoch 009: 111 / 1689 loss=4.396, nll_loss=2.796, ppl=6.94, wps=454379, ups=1.05, wpb=431336, bsz=16573.1, num_updates=13600, lr=0.000542326, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13215 epoch 009: 111 / 1689 loss=4.396, nll_loss=2.796, ppl=6.94, wps=454379, ups=1.05, wpb=431336, bsz=16573.1, num_updates=13600, lr=0.000542326, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13215 epoch 009: 111 / 1689 loss=4.396, nll_loss=2.796, ppl=6.94, wps=454379, ups=1.05, wpb=431336, bsz=16573.1, num_updates=13600, lr=0.000542326, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13215 epoch 009: 211 / 1689 loss=4.41, nll_loss=2.813, ppl=7.03, wps=457820, ups=1.05, wpb=434171, bsz=16645.9, num_updates=13700, lr=0.000540343, gnorm=0.302, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13309 epoch 009: 211 / 1689 loss=4.41, nll_loss=2.813, ppl=7.03, wps=457820, ups=1.05, wpb=434171, bsz=16645.9, num_updates=13700, lr=0.000540343, gnorm=0.302, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13309 epoch 009: 211 / 1689 loss=4.41, nll_loss=2.813, ppl=7.03, wps=457820, ups=1.05, wpb=434171, bsz=16645.9, num_updates=13700, lr=0.000540343, gnorm=0.302, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13309 epoch 009: 211 / 1689 loss=4.41, nll_loss=2.813, ppl=7.03, wps=457820, ups=1.05, wpb=434171, bsz=16645.9, num_updates=13700, lr=0.000540343, gnorm=0.302, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13309 epoch 009: 211 / 1689 loss=4.41, nll_loss=2.813, ppl=7.03, wps=457820, ups=1.05, wpb=434171, bsz=16645.9, num_updates=13700, lr=0.000540343, gnorm=0.302, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13309 epoch 009: 211 / 1689 loss=4.41, nll_loss=2.813, ppl=7.03, wps=457820, ups=1.05, wpb=434171, bsz=16645.9, num_updates=13700, lr=0.000540343, gnorm=0.302, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13309 epoch 009: 211 / 1689 loss=4.41, nll_loss=2.813, ppl=7.03, wps=457820, ups=1.05, wpb=434171, bsz=16645.9, num_updates=13700, lr=0.000540343, gnorm=0.302, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13309 epoch 009: 211 / 1689 loss=4.41, nll_loss=2.813, ppl=7.03, wps=457820, ups=1.05, wpb=434171, bsz=16645.9, num_updates=13700, lr=0.000540343, gnorm=0.302, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13309 epoch 009: 211 / 1689 loss=4.41, nll_loss=2.813, ppl=7.03, wps=457820, ups=1.05, wpb=434171, bsz=16645.9, num_updates=13700, lr=0.000540343, gnorm=0.302, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13309 epoch 009: 311 / 1689 loss=4.416, nll_loss=2.82, ppl=7.06, wps=455982, ups=1.05, wpb=435037, bsz=16586.6, num_updates=13800, lr=0.000538382, gnorm=0.291, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=13405 epoch 009: 311 / 1689 loss=4.416, nll_loss=2.82, ppl=7.06, wps=455982, ups=1.05, wpb=435037, bsz=16586.6, num_updates=13800, lr=0.000538382, gnorm=0.291, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=13405 epoch 009: 311 / 1689 loss=4.416, nll_loss=2.82, ppl=7.06, wps=455982, ups=1.05, wpb=435037, bsz=16586.6, num_updates=13800, lr=0.000538382, gnorm=0.291, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=13405 epoch 009: 311 / 1689 loss=4.416, nll_loss=2.82, ppl=7.06, wps=455982, ups=1.05, wpb=435037, bsz=16586.6, num_updates=13800, lr=0.000538382, gnorm=0.291, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=13405 epoch 009: 311 / 1689 loss=4.416, nll_loss=2.82, ppl=7.06, wps=455982, ups=1.05, wpb=435037, bsz=16586.6, num_updates=13800, lr=0.000538382, gnorm=0.291, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=13405 epoch 009: 311 / 1689 loss=4.416, nll_loss=2.82, ppl=7.06, wps=455982, ups=1.05, wpb=435037, bsz=16586.6, num_updates=13800, lr=0.000538382, gnorm=0.291, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=13405 epoch 009: 311 / 1689 loss=4.416, nll_loss=2.82, ppl=7.06, wps=455982, ups=1.05, wpb=435037, bsz=16586.6, num_updates=13800, lr=0.000538382, gnorm=0.291, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=13405 epoch 009: 311 / 1689 loss=4.416, nll_loss=2.82, ppl=7.06, wps=455982, ups=1.05, wpb=435037, bsz=16586.6, num_updates=13800, lr=0.000538382, gnorm=0.291, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=13405 epoch 009: 311 / 1689 loss=4.416, nll_loss=2.82, ppl=7.06, wps=455982, ups=1.05, wpb=435037, bsz=16586.6, num_updates=13800, lr=0.000538382, gnorm=0.291, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=13405 epoch 009: 411 / 1689 loss=4.407, nll_loss=2.809, ppl=7.01, wps=458436, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13500 epoch 009: 411 / 1689 loss=4.407, nll_loss=2.809, ppl=7.01, wps=458436, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13500 epoch 009: 411 / 1689 loss=4.407, nll_loss=2.809, ppl=7.01, wps=458436, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13500 epoch 009: 411 / 1689 loss=4.407, nll_loss=2.809, ppl=7.01, wps=458436, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13500 epoch 009: 411 / 1689 loss=4.407, nll_loss=2.809, ppl=7.01, wps=458436, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13500 epoch 009: 411 / 1689 loss=4.407, nll_loss=2.809, ppl=7.01, wps=458436, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13500 epoch 009: 411 / 1689 loss=4.407, nll_loss=2.809, ppl=7.01, wps=458436, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13500 epoch 009: 411 / 1689 loss=4.407, nll_loss=2.809, ppl=7.01, wps=458436, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13500 epoch 009: 411 / 1689 loss=4.407, nll_loss=2.809, ppl=7.01, wps=458436, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13500 epoch 009: 511 / 1689 loss=4.409, nll_loss=2.812, ppl=7.02, wps=457634, ups=1.06, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13594 epoch 009: 511 / 1689 loss=4.409, nll_loss=2.812, ppl=7.02, wps=457634, ups=1.06, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13594 epoch 009: 511 / 1689 loss=4.409, nll_loss=2.812, ppl=7.02, wps=457634, ups=1.06, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13594 epoch 009: 511 / 1689 loss=4.409, nll_loss=2.812, ppl=7.02, wps=457634, ups=1.06, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13594 epoch 009: 511 / 1689 loss=4.409, nll_loss=2.812, ppl=7.02, wps=457634, ups=1.06, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13594 epoch 009: 511 / 1689 loss=4.409, nll_loss=2.812, ppl=7.02, wps=457634, ups=1.06, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13594 epoch 009: 511 / 1689 loss=4.409, nll_loss=2.812, ppl=7.02, wps=457634, ups=1.06, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13594 epoch 009: 511 / 1689 loss=4.409, nll_loss=2.812, ppl=7.02, wps=457634, ups=1.06, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13594 epoch 009: 511 / 1689 loss=4.409, nll_loss=2.812, ppl=7.02, wps=457634, ups=1.06, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.303, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=13594 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 4.354 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.354 epoch 009 | valid on 'valid' subset | loss 4.354 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.354 epoch 009 | valid on 'valid' subset | loss 4.354 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.354 epoch 009 | valid on 'valid' subset | loss 4.354 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.354 epoch 009 | valid on 'valid' subset | loss 4.354 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.354 epoch 009 | valid on 'valid' subset | loss 4.354 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.354 epoch 009 | valid on 'valid' subset | loss 4.354 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.354 epoch 009 | valid on 'valid' subset | loss 4.354 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.354 epoch 009 | valid on 'valid' subset | loss 4.354 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.354 epoch 009: 611 / 1689 loss=4.417, nll_loss=2.821, ppl=7.06, wps=308949, ups=0.71, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.297, clip=0, loss_scale=4, train_wall=91, gb_free=18.9, wall=13735 epoch 009: 611 / 1689 loss=4.417, nll_loss=2.821, ppl=7.06, wps=308949, ups=0.71, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.297, clip=0, loss_scale=4, train_wall=91, gb_free=18.9, wall=13735 epoch 009: 611 / 1689 loss=4.417, nll_loss=2.821, ppl=7.06, wps=308949, ups=0.71, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.297, clip=0, loss_scale=4, train_wall=91, gb_free=18.9, wall=13735 epoch 009: 611 / 1689 loss=4.417, nll_loss=2.821, ppl=7.06, wps=308949, ups=0.71, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.297, clip=0, loss_scale=4, train_wall=91, gb_free=18.9, wall=13735 epoch 009: 611 / 1689 loss=4.417, nll_loss=2.821, ppl=7.06, wps=308949, ups=0.71, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.297, clip=0, loss_scale=4, train_wall=91, gb_free=18.9, wall=13735 epoch 009: 611 / 1689 loss=4.417, nll_loss=2.821, ppl=7.06, wps=308949, ups=0.71, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.297, clip=0, loss_scale=4, train_wall=91, gb_free=18.9, wall=13735 epoch 009: 611 / 1689 loss=4.417, nll_loss=2.821, ppl=7.06, wps=308949, ups=0.71, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.297, clip=0, loss_scale=4, train_wall=91, gb_free=18.9, wall=13735 epoch 009: 611 / 1689 loss=4.417, nll_loss=2.821, ppl=7.06, wps=308949, ups=0.71, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.297, clip=0, loss_scale=4, train_wall=91, gb_free=18.9, wall=13735 epoch 009: 611 / 1689 loss=4.417, nll_loss=2.821, ppl=7.06, wps=308949, ups=0.71, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.297, clip=0, loss_scale=4, train_wall=91, gb_free=18.9, wall=13735 epoch 009: 711 / 1689 loss=4.421, nll_loss=2.826, ppl=7.09, wps=466778, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.279, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=13828 epoch 009: 711 / 1689 loss=4.421, nll_loss=2.826, ppl=7.09, wps=466778, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.279, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=13828 epoch 009: 711 / 1689 loss=4.421, nll_loss=2.826, ppl=7.09, wps=466778, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.279, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=13828 epoch 009: 711 / 1689 loss=4.421, nll_loss=2.826, ppl=7.09, wps=466778, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.279, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=13828 epoch 009: 711 / 1689 loss=4.421, nll_loss=2.826, ppl=7.09, wps=466778, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.279, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=13828 epoch 009: 711 / 1689 loss=4.421, nll_loss=2.826, ppl=7.09, wps=466778, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.279, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=13828 epoch 009: 711 / 1689 loss=4.421, nll_loss=2.826, ppl=7.09, wps=466778, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.279, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=13828 epoch 009: 711 / 1689 loss=4.421, nll_loss=2.826, ppl=7.09, wps=466778, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.279, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=13828 epoch 009: 711 / 1689 loss=4.421, nll_loss=2.826, ppl=7.09, wps=466778, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.279, clip=0, loss_scale=4, train_wall=93, gb_free=20.3, wall=13828 epoch 009: 812 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=461649, ups=1.07, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.289, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=13922 epoch 009: 812 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=461649, ups=1.07, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.289, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=13922 epoch 009: 812 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=461649, ups=1.07, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.289, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=13922 epoch 009: 812 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=461649, ups=1.07, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.289, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=13922 epoch 009: 812 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=461649, ups=1.07, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.289, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=13922 epoch 009: 812 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=461649, ups=1.07, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.289, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=13922 epoch 009: 812 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=461649, ups=1.07, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.289, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=13922 epoch 009: 812 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=461649, ups=1.07, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.289, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=13922 epoch 009: 812 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=461649, ups=1.07, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.289, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=13922 epoch 009: 912 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=463454, ups=1.07, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.286, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14016 epoch 009: 912 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=463454, ups=1.07, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.286, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14016 epoch 009: 912 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=463454, ups=1.07, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.286, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14016 epoch 009: 912 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=463454, ups=1.07, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.286, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14016 epoch 009: 912 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=463454, ups=1.07, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.286, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14016 epoch 009: 912 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=463454, ups=1.07, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.286, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14016 epoch 009: 912 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=463454, ups=1.07, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.286, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14016 epoch 009: 912 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=463454, ups=1.07, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.286, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14016 epoch 009: 912 / 1689 loss=4.405, nll_loss=2.808, ppl=7, wps=463454, ups=1.07, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.286, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14016 epoch 009: 1012 / 1689 loss=4.396, nll_loss=2.797, ppl=6.95, wps=462789, ups=1.07, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.274, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14109 epoch 009: 1012 / 1689 loss=4.396, nll_loss=2.797, ppl=6.95, wps=462789, ups=1.07, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.274, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14109 epoch 009: 1012 / 1689 loss=4.396, nll_loss=2.797, ppl=6.95, wps=462789, ups=1.07, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.274, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14109 epoch 009: 1012 / 1689 loss=4.396, nll_loss=2.797, ppl=6.95, wps=462789, ups=1.07, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.274, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14109 epoch 009: 1012 / 1689 loss=4.396, nll_loss=2.797, ppl=6.95, wps=462789, ups=1.07, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.274, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14109 epoch 009: 1012 / 1689 loss=4.396, nll_loss=2.797, ppl=6.95, wps=462789, ups=1.07, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.274, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14109 epoch 009: 1012 / 1689 loss=4.396, nll_loss=2.797, ppl=6.95, wps=462789, ups=1.07, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.274, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14109 epoch 009: 1012 / 1689 loss=4.396, nll_loss=2.797, ppl=6.95, wps=462789, ups=1.07, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.274, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14109 epoch 009: 1012 / 1689 loss=4.396, nll_loss=2.797, ppl=6.95, wps=462789, ups=1.07, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.274, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14109 epoch 009: 1112 / 1689 loss=4.414, nll_loss=2.818, ppl=7.05, wps=464544, ups=1.07, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.279, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14203 epoch 009: 1112 / 1689 loss=4.414, nll_loss=2.818, ppl=7.05, wps=464544, ups=1.07, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.279, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14203 epoch 009: 1112 / 1689 loss=4.414, nll_loss=2.818, ppl=7.05, wps=464544, ups=1.07, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.279, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14203 epoch 009: 1112 / 1689 loss=4.414, nll_loss=2.818, ppl=7.05, wps=464544, ups=1.07, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.279, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14203 epoch 009: 1112 / 1689 loss=4.414, nll_loss=2.818, ppl=7.05, wps=464544, ups=1.07, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.279, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14203 epoch 009: 1112 / 1689 loss=4.414, nll_loss=2.818, ppl=7.05, wps=464544, ups=1.07, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.279, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14203 epoch 009: 1112 / 1689 loss=4.414, nll_loss=2.818, ppl=7.05, wps=464544, ups=1.07, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.279, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14203 epoch 009: 1112 / 1689 loss=4.414, nll_loss=2.818, ppl=7.05, wps=464544, ups=1.07, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.279, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14203 epoch 009: 1112 / 1689 loss=4.414, nll_loss=2.818, ppl=7.05, wps=464544, ups=1.07, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.279, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=14203 epoch 009: 1212 / 1689 loss=4.398, nll_loss=2.8, ppl=6.96, wps=458796, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=14297 epoch 009: 1212 / 1689 loss=4.398, nll_loss=2.8, ppl=6.96, wps=458796, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=14297 epoch 009: 1212 / 1689 loss=4.398, nll_loss=2.8, ppl=6.96, wps=458796, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=14297 epoch 009: 1212 / 1689 loss=4.398, nll_loss=2.8, ppl=6.96, wps=458796, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=14297 epoch 009: 1212 / 1689 loss=4.398, nll_loss=2.8, ppl=6.96, wps=458796, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=14297 epoch 009: 1212 / 1689 loss=4.398, nll_loss=2.8, ppl=6.96, wps=458796, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=14297 epoch 009: 1212 / 1689 loss=4.398, nll_loss=2.8, ppl=6.96, wps=458796, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=14297 epoch 009: 1212 / 1689 loss=4.398, nll_loss=2.8, ppl=6.96, wps=458796, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=14297 epoch 009: 1212 / 1689 loss=4.398, nll_loss=2.8, ppl=6.96, wps=458796, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=14297 epoch 009: 1312 / 1689 loss=4.404, nll_loss=2.807, ppl=7, wps=456317, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.278, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=14392 epoch 009: 1312 / 1689 loss=4.404, nll_loss=2.807, ppl=7, wps=456317, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.278, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=14392 epoch 009: 1312 / 1689 loss=4.404, nll_loss=2.807, ppl=7, wps=456317, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.278, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=14392 epoch 009: 1312 / 1689 loss=4.404, nll_loss=2.807, ppl=7, wps=456317, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.278, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=14392 epoch 009: 1312 / 1689 loss=4.404, nll_loss=2.807, ppl=7, wps=456317, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.278, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=14392 epoch 009: 1312 / 1689 loss=4.404, nll_loss=2.807, ppl=7, wps=456317, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.278, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=14392 epoch 009: 1312 / 1689 loss=4.404, nll_loss=2.807, ppl=7, wps=456317, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.278, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=14392 epoch 009: 1312 / 1689 loss=4.404, nll_loss=2.807, ppl=7, wps=456317, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.278, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=14392 epoch 009: 1312 / 1689 loss=4.404, nll_loss=2.807, ppl=7, wps=456317, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.278, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=14392 epoch 009: 1412 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=456992, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.28, clip=0, loss_scale=4, train_wall=93, gb_free=17.2, wall=14486 epoch 009: 1412 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=456992, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.28, clip=0, loss_scale=4, train_wall=93, gb_free=17.2, wall=14486 epoch 009: 1412 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=456992, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.28, clip=0, loss_scale=4, train_wall=93, gb_free=17.2, wall=14486 epoch 009: 1412 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=456992, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.28, clip=0, loss_scale=4, train_wall=93, gb_free=17.2, wall=14486 epoch 009: 1412 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=456992, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.28, clip=0, loss_scale=4, train_wall=93, gb_free=17.2, wall=14486 epoch 009: 1412 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=456992, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.28, clip=0, loss_scale=4, train_wall=93, gb_free=17.2, wall=14486 epoch 009: 1412 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=456992, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.28, clip=0, loss_scale=4, train_wall=93, gb_free=17.2, wall=14486 epoch 009: 1412 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=456992, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.28, clip=0, loss_scale=4, train_wall=93, gb_free=17.2, wall=14486 epoch 009: 1412 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=456992, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.28, clip=0, loss_scale=4, train_wall=93, gb_free=17.2, wall=14486 epoch 009: 1513 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=454068, ups=1.05, wpb=433525, bsz=16572.6, num_updates=15000, lr=0.000516398, gnorm=0.277, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=14582 epoch 009: 1513 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=454068, ups=1.05, wpb=433525, bsz=16572.6, num_updates=15000, lr=0.000516398, gnorm=0.277, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=14582 epoch 009: 1513 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=454068, ups=1.05, wpb=433525, bsz=16572.6, num_updates=15000, lr=0.000516398, gnorm=0.277, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=14582 epoch 009: 1513 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=454068, ups=1.05, wpb=433525, bsz=16572.6, num_updates=15000, lr=0.000516398, gnorm=0.277, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=14582 epoch 009: 1513 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=454068, ups=1.05, wpb=433525, bsz=16572.6, num_updates=15000, lr=0.000516398, gnorm=0.277, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=14582 epoch 009: 1513 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=454068, ups=1.05, wpb=433525, bsz=16572.6, num_updates=15000, lr=0.000516398, gnorm=0.277, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=14582 epoch 009: 1513 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=454068, ups=1.05, wpb=433525, bsz=16572.6, num_updates=15000, lr=0.000516398, gnorm=0.277, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=14582 epoch 009: 1513 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=454068, ups=1.05, wpb=433525, bsz=16572.6, num_updates=15000, lr=0.000516398, gnorm=0.277, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=14582 epoch 009: 1513 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=454068, ups=1.05, wpb=433525, bsz=16572.6, num_updates=15000, lr=0.000516398, gnorm=0.277, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=14582 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 4.343 | nll_loss 2.693 | ppl 6.47 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.343 epoch 009 | valid on 'valid' subset | loss 4.343 | nll_loss 2.693 | ppl 6.47 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.343 epoch 009 | valid on 'valid' subset | loss 4.343 | nll_loss 2.693 | ppl 6.47 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.343 epoch 009 | valid on 'valid' subset | loss 4.343 | nll_loss 2.693 | ppl 6.47 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.343 epoch 009 | valid on 'valid' subset | loss 4.343 | nll_loss 2.693 | ppl 6.47 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.343 epoch 009 | valid on 'valid' subset | loss 4.343 | nll_loss 2.693 | ppl 6.47 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.343 epoch 009 | valid on 'valid' subset | loss 4.343 | nll_loss 2.693 | ppl 6.47 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.343 epoch 009 | valid on 'valid' subset | loss 4.343 | nll_loss 2.693 | ppl 6.47 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.343 epoch 009 | valid on 'valid' subset | loss 4.343 | nll_loss 2.693 | ppl 6.47 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.343 epoch 009: 1613 / 1689 loss=4.418, nll_loss=2.823, ppl=7.08, wps=332374, ups=0.76, wpb=434641, bsz=16613, num_updates=15100, lr=0.000514685, gnorm=0.285, clip=0, loss_scale=2, train_wall=107, gb_free=18.7, wall=14713 epoch 009: 1613 / 1689 loss=4.418, nll_loss=2.823, ppl=7.08, wps=332374, ups=0.76, wpb=434641, bsz=16613, num_updates=15100, lr=0.000514685, gnorm=0.285, clip=0, loss_scale=2, train_wall=107, gb_free=18.7, wall=14713 epoch 009: 1613 / 1689 loss=4.418, nll_loss=2.823, ppl=7.08, wps=332374, ups=0.76, wpb=434641, bsz=16613, num_updates=15100, lr=0.000514685, gnorm=0.285, clip=0, loss_scale=2, train_wall=107, gb_free=18.7, wall=14713 epoch 009: 1613 / 1689 loss=4.418, nll_loss=2.823, ppl=7.08, wps=332374, ups=0.76, wpb=434641, bsz=16613, num_updates=15100, lr=0.000514685, gnorm=0.285, clip=0, loss_scale=2, train_wall=107, gb_free=18.7, wall=14713 epoch 009: 1613 / 1689 loss=4.418, nll_loss=2.823, ppl=7.08, wps=332374, ups=0.76, wpb=434641, bsz=16613, num_updates=15100, lr=0.000514685, gnorm=0.285, clip=0, loss_scale=2, train_wall=107, gb_free=18.7, wall=14713 epoch 009: 1613 / 1689 loss=4.418, nll_loss=2.823, ppl=7.08, wps=332374, ups=0.76, wpb=434641, bsz=16613, num_updates=15100, lr=0.000514685, gnorm=0.285, clip=0, loss_scale=2, train_wall=107, gb_free=18.7, wall=14713 epoch 009: 1613 / 1689 loss=4.418, nll_loss=2.823, ppl=7.08, wps=332374, ups=0.76, wpb=434641, bsz=16613, num_updates=15100, lr=0.000514685, gnorm=0.285, clip=0, loss_scale=2, train_wall=107, gb_free=18.7, wall=14713 epoch 009: 1613 / 1689 loss=4.418, nll_loss=2.823, ppl=7.08, wps=332374, ups=0.76, wpb=434641, bsz=16613, num_updates=15100, lr=0.000514685, gnorm=0.285, clip=0, loss_scale=2, train_wall=107, gb_free=18.7, wall=14713 epoch 009: 1613 / 1689 loss=4.418, nll_loss=2.823, ppl=7.08, wps=332374, ups=0.76, wpb=434641, bsz=16613, num_updates=15100, lr=0.000514685, gnorm=0.285, clip=0, loss_scale=2, train_wall=107, gb_free=18.7, wall=14713 end of epoch 9 (average epoch stats below) epoch 009 | loss 4.408 | nll_loss 2.811 | ppl 7.02 | wps 436708 | ups 1.01 | wpb 433543 | bsz 16504.5 | num_updates 15176 | lr 0.000513395 | gnorm 0.286 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.2 | wall 14783 epoch 009 | loss 4.408 | nll_loss 2.811 | ppl 7.02 | wps 436708 | ups 1.01 | wpb 433543 | bsz 16504.5 | num_updates 15176 | lr 0.000513395 | gnorm 0.286 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.2 | wall 14783 epoch 009 | loss 4.408 | nll_loss 2.811 | ppl 7.02 | wps 436708 | ups 1.01 | wpb 433543 | bsz 16504.5 | num_updates 15176 | lr 0.000513395 | gnorm 0.286 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.2 | wall 14783 epoch 009 | loss 4.408 | nll_loss 2.811 | ppl 7.02 | wps 436708 | ups 1.01 | wpb 433543 | bsz 16504.5 | num_updates 15176 | lr 0.000513395 | gnorm 0.286 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.2 | wall 14783 epoch 009 | loss 4.408 | nll_loss 2.811 | ppl 7.02 | wps 436708 | ups 1.01 | wpb 433543 | bsz 16504.5 | num_updates 15176 | lr 0.000513395 | gnorm 0.286 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.2 | wall 14783 epoch 009 | loss 4.408 | nll_loss 2.811 | ppl 7.02 | wps 436708 | ups 1.01 | wpb 433543 | bsz 16504.5 | num_updates 15176 | lr 0.000513395 | gnorm 0.286 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.2 | wall 14783 epoch 009 | loss 4.408 | nll_loss 2.811 | ppl 7.02 | wps 436708 | ups 1.01 | wpb 433543 | bsz 16504.5 | num_updates 15176 | lr 0.000513395 | gnorm 0.286 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.2 | wall 14783 epoch 009 | loss 4.408 | nll_loss 2.811 | ppl 7.02 | wps 436708 | ups 1.01 | wpb 433543 | bsz 16504.5 | num_updates 15176 | lr 0.000513395 | gnorm 0.286 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.2 | wall 14783 epoch 009 | loss 4.408 | nll_loss 2.811 | ppl 7.02 | wps 436708 | ups 1.01 | wpb 433543 | bsz 16504.5 | num_updates 15176 | lr 0.000513395 | gnorm 0.286 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.2 | wall 14783 Start iterating over samples epoch 010: 24 / 1689 loss=4.414, nll_loss=2.819, ppl=7.06, wps=456630, ups=1.06, wpb=431550, bsz=16670.5, num_updates=15200, lr=0.000512989, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=14807 epoch 010: 24 / 1689 loss=4.414, nll_loss=2.819, ppl=7.06, wps=456630, ups=1.06, wpb=431550, bsz=16670.5, num_updates=15200, lr=0.000512989, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=14807 epoch 010: 24 / 1689 loss=4.414, nll_loss=2.819, ppl=7.06, wps=456630, ups=1.06, wpb=431550, bsz=16670.5, num_updates=15200, lr=0.000512989, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=14807 epoch 010: 24 / 1689 loss=4.414, nll_loss=2.819, ppl=7.06, wps=456630, ups=1.06, wpb=431550, bsz=16670.5, num_updates=15200, lr=0.000512989, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=14807 epoch 010: 24 / 1689 loss=4.414, nll_loss=2.819, ppl=7.06, wps=456630, ups=1.06, wpb=431550, bsz=16670.5, num_updates=15200, lr=0.000512989, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=14807 epoch 010: 24 / 1689 loss=4.414, nll_loss=2.819, ppl=7.06, wps=456630, ups=1.06, wpb=431550, bsz=16670.5, num_updates=15200, lr=0.000512989, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=14807 epoch 010: 24 / 1689 loss=4.414, nll_loss=2.819, ppl=7.06, wps=456630, ups=1.06, wpb=431550, bsz=16670.5, num_updates=15200, lr=0.000512989, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=14807 epoch 010: 24 / 1689 loss=4.414, nll_loss=2.819, ppl=7.06, wps=456630, ups=1.06, wpb=431550, bsz=16670.5, num_updates=15200, lr=0.000512989, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=14807 epoch 010: 24 / 1689 loss=4.414, nll_loss=2.819, ppl=7.06, wps=456630, ups=1.06, wpb=431550, bsz=16670.5, num_updates=15200, lr=0.000512989, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=14807 epoch 010: 24 / 1689 loss=4.414, nll_loss=2.819, ppl=7.06, wps=456630, ups=1.06, wpb=431550, bsz=16670.5, num_updates=15200, lr=0.000512989, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=17.5, wall=14807 epoch 010: 124 / 1689 loss=4.385, nll_loss=2.784, ppl=6.89, wps=460316, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=14902 epoch 010: 124 / 1689 loss=4.385, nll_loss=2.784, ppl=6.89, wps=460316, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=14902 epoch 010: 124 / 1689 loss=4.385, nll_loss=2.784, ppl=6.89, wps=460316, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=14902 epoch 010: 124 / 1689 loss=4.385, nll_loss=2.784, ppl=6.89, wps=460316, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=14902 epoch 010: 124 / 1689 loss=4.385, nll_loss=2.784, ppl=6.89, wps=460316, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=14902 epoch 010: 124 / 1689 loss=4.385, nll_loss=2.784, ppl=6.89, wps=460316, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=14902 epoch 010: 124 / 1689 loss=4.385, nll_loss=2.784, ppl=6.89, wps=460316, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=14902 epoch 010: 124 / 1689 loss=4.385, nll_loss=2.784, ppl=6.89, wps=460316, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=14902 epoch 010: 124 / 1689 loss=4.385, nll_loss=2.784, ppl=6.89, wps=460316, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=14902 epoch 010: 124 / 1689 loss=4.385, nll_loss=2.784, ppl=6.89, wps=460316, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=14902 epoch 010: 224 / 1689 loss=4.376, nll_loss=2.775, ppl=6.84, wps=458308, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=14996 epoch 010: 224 / 1689 loss=4.376, nll_loss=2.775, ppl=6.84, wps=458308, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=14996 epoch 010: 224 / 1689 loss=4.376, nll_loss=2.775, ppl=6.84, wps=458308, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=14996 epoch 010: 224 / 1689 loss=4.376, nll_loss=2.775, ppl=6.84, wps=458308, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=14996 epoch 010: 224 / 1689 loss=4.376, nll_loss=2.775, ppl=6.84, wps=458308, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=14996 epoch 010: 224 / 1689 loss=4.376, nll_loss=2.775, ppl=6.84, wps=458308, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=14996 epoch 010: 224 / 1689 loss=4.376, nll_loss=2.775, ppl=6.84, wps=458308, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=14996 epoch 010: 224 / 1689 loss=4.376, nll_loss=2.775, ppl=6.84, wps=458308, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=14996 epoch 010: 224 / 1689 loss=4.376, nll_loss=2.775, ppl=6.84, wps=458308, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=14996 epoch 010: 224 / 1689 loss=4.376, nll_loss=2.775, ppl=6.84, wps=458308, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=14996 epoch 010: 325 / 1689 loss=4.382, nll_loss=2.781, ppl=6.87, wps=455754, ups=1.05, wpb=434026, bsz=16464.6, num_updates=15500, lr=0.000508001, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=15091 epoch 010: 325 / 1689 loss=4.382, nll_loss=2.781, ppl=6.87, wps=455754, ups=1.05, wpb=434026, bsz=16464.6, num_updates=15500, lr=0.000508001, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=15091 epoch 010: 325 / 1689 loss=4.382, nll_loss=2.781, ppl=6.87, wps=455754, ups=1.05, wpb=434026, bsz=16464.6, num_updates=15500, lr=0.000508001, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=15091 epoch 010: 325 / 1689 loss=4.382, nll_loss=2.781, ppl=6.87, wps=455754, ups=1.05, wpb=434026, bsz=16464.6, num_updates=15500, lr=0.000508001, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=15091 epoch 010: 325 / 1689 loss=4.382, nll_loss=2.781, ppl=6.87, wps=455754, ups=1.05, wpb=434026, bsz=16464.6, num_updates=15500, lr=0.000508001, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=15091 epoch 010: 325 / 1689 loss=4.382, nll_loss=2.781, ppl=6.87, wps=455754, ups=1.05, wpb=434026, bsz=16464.6, num_updates=15500, lr=0.000508001, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=15091 epoch 010: 325 / 1689 loss=4.382, nll_loss=2.781, ppl=6.87, wps=455754, ups=1.05, wpb=434026, bsz=16464.6, num_updates=15500, lr=0.000508001, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=15091 epoch 010: 325 / 1689 loss=4.382, nll_loss=2.781, ppl=6.87, wps=455754, ups=1.05, wpb=434026, bsz=16464.6, num_updates=15500, lr=0.000508001, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=15091 epoch 010: 325 / 1689 loss=4.382, nll_loss=2.781, ppl=6.87, wps=455754, ups=1.05, wpb=434026, bsz=16464.6, num_updates=15500, lr=0.000508001, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=15091 epoch 010: 325 / 1689 loss=4.382, nll_loss=2.781, ppl=6.87, wps=455754, ups=1.05, wpb=434026, bsz=16464.6, num_updates=15500, lr=0.000508001, gnorm=0.285, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=15091 epoch 010: 425 / 1689 loss=4.379, nll_loss=2.779, ppl=6.86, wps=462400, ups=1.06, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.283, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=15185 epoch 010: 425 / 1689 loss=4.379, nll_loss=2.779, ppl=6.86, wps=462400, ups=1.06, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.283, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=15185 epoch 010: 425 / 1689 loss=4.379, nll_loss=2.779, ppl=6.86, wps=462400, ups=1.06, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.283, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=15185 epoch 010: 425 / 1689 loss=4.379, nll_loss=2.779, ppl=6.86, wps=462400, ups=1.06, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.283, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=15185 epoch 010: 425 / 1689 loss=4.379, nll_loss=2.779, ppl=6.86, wps=462400, ups=1.06, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.283, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=15185 epoch 010: 425 / 1689 loss=4.379, nll_loss=2.779, ppl=6.86, wps=462400, ups=1.06, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.283, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=15185 epoch 010: 425 / 1689 loss=4.379, nll_loss=2.779, ppl=6.86, wps=462400, ups=1.06, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.283, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=15185 epoch 010: 425 / 1689 loss=4.379, nll_loss=2.779, ppl=6.86, wps=462400, ups=1.06, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.283, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=15185 epoch 010: 425 / 1689 loss=4.379, nll_loss=2.779, ppl=6.86, wps=462400, ups=1.06, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.283, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=15185 epoch 010: 425 / 1689 loss=4.379, nll_loss=2.779, ppl=6.86, wps=462400, ups=1.06, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.283, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=15185 epoch 010: 525 / 1689 loss=4.382, nll_loss=2.782, ppl=6.88, wps=457021, ups=1.06, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=15280 epoch 010: 525 / 1689 loss=4.382, nll_loss=2.782, ppl=6.88, wps=457021, ups=1.06, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=15280 epoch 010: 525 / 1689 loss=4.382, nll_loss=2.782, ppl=6.88, wps=457021, ups=1.06, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=15280 epoch 010: 525 / 1689 loss=4.382, nll_loss=2.782, ppl=6.88, wps=457021, ups=1.06, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=15280 epoch 010: 525 / 1689 loss=4.382, nll_loss=2.782, ppl=6.88, wps=457021, ups=1.06, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=15280 epoch 010: 525 / 1689 loss=4.382, nll_loss=2.782, ppl=6.88, wps=457021, ups=1.06, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=15280 epoch 010: 525 / 1689 loss=4.382, nll_loss=2.782, ppl=6.88, wps=457021, ups=1.06, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=15280 epoch 010: 525 / 1689 loss=4.382, nll_loss=2.782, ppl=6.88, wps=457021, ups=1.06, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=15280 epoch 010: 525 / 1689 loss=4.382, nll_loss=2.782, ppl=6.88, wps=457021, ups=1.06, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=15280 epoch 010: 525 / 1689 loss=4.382, nll_loss=2.782, ppl=6.88, wps=457021, ups=1.06, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=15280 epoch 010: 625 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=455575, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=15375 epoch 010: 625 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=455575, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=15375 epoch 010: 625 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=455575, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=15375 epoch 010: 625 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=455575, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=15375 epoch 010: 625 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=455575, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=15375 epoch 010: 625 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=455575, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=15375 epoch 010: 625 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=455575, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=15375 epoch 010: 625 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=455575, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=15375 epoch 010: 625 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=455575, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=15375 epoch 010: 625 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=455575, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.293, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=15375 epoch 010: 725 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=456318, ups=1.05, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.269, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=15470 epoch 010: 725 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=456318, ups=1.05, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.269, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=15470 epoch 010: 725 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=456318, ups=1.05, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.269, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=15470 epoch 010: 725 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=456318, ups=1.05, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.269, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=15470 epoch 010: 725 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=456318, ups=1.05, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.269, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=15470 epoch 010: 725 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=456318, ups=1.05, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.269, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=15470 epoch 010: 725 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=456318, ups=1.05, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.269, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=15470 epoch 010: 725 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=456318, ups=1.05, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.269, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=15470 epoch 010: 725 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=456318, ups=1.05, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.269, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=15470 epoch 010: 725 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=456318, ups=1.05, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.269, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=15470 epoch 010: 825 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=458808, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=15565 epoch 010: 825 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=458808, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=15565 epoch 010: 825 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=458808, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=15565 epoch 010: 825 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=458808, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=15565 epoch 010: 825 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=458808, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=15565 epoch 010: 825 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=458808, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=15565 epoch 010: 825 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=458808, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=15565 epoch 010: 825 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=458808, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=15565 epoch 010: 825 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=458808, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=15565 epoch 010: 825 / 1689 loss=4.38, nll_loss=2.78, ppl=6.87, wps=458808, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=15565 begin validation on "valid" subset epoch 010 | valid on 'valid' subset | loss 4.358 | nll_loss 2.707 | ppl 6.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.343 epoch 010 | valid on 'valid' subset | loss 4.358 | nll_loss 2.707 | ppl 6.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.343 epoch 010 | valid on 'valid' subset | loss 4.358 | nll_loss 2.707 | ppl 6.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.343 epoch 010 | valid on 'valid' subset | loss 4.358 | nll_loss 2.707 | ppl 6.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.343 epoch 010 | valid on 'valid' subset | loss 4.358 | nll_loss 2.707 | ppl 6.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.343 epoch 010 | valid on 'valid' subset | loss 4.358 | nll_loss 2.707 | ppl 6.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.343 epoch 010 | valid on 'valid' subset | loss 4.358 | nll_loss 2.707 | ppl 6.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.343 epoch 010 | valid on 'valid' subset | loss 4.358 | nll_loss 2.707 | ppl 6.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.343 epoch 010 | valid on 'valid' subset | loss 4.358 | nll_loss 2.707 | ppl 6.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.343 epoch 010 | valid on 'valid' subset | loss 4.358 | nll_loss 2.707 | ppl 6.53 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.343 epoch 010: 925 / 1689 loss=4.391, nll_loss=2.793, ppl=6.93, wps=408000, ups=0.94, wpb=435804, bsz=16754.2, num_updates=16100, lr=0.000498445, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=15671 epoch 010: 925 / 1689 loss=4.391, nll_loss=2.793, ppl=6.93, wps=408000, ups=0.94, wpb=435804, bsz=16754.2, num_updates=16100, lr=0.000498445, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=15671 epoch 010: 925 / 1689 loss=4.391, nll_loss=2.793, ppl=6.93, wps=408000, ups=0.94, wpb=435804, bsz=16754.2, num_updates=16100, lr=0.000498445, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=15671 epoch 010: 925 / 1689 loss=4.391, nll_loss=2.793, ppl=6.93, wps=408000, ups=0.94, wpb=435804, bsz=16754.2, num_updates=16100, lr=0.000498445, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=15671 epoch 010: 925 / 1689 loss=4.391, nll_loss=2.793, ppl=6.93, wps=408000, ups=0.94, wpb=435804, bsz=16754.2, num_updates=16100, lr=0.000498445, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=15671 epoch 010: 925 / 1689 loss=4.391, nll_loss=2.793, ppl=6.93, wps=408000, ups=0.94, wpb=435804, bsz=16754.2, num_updates=16100, lr=0.000498445, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=15671 epoch 010: 925 / 1689 loss=4.391, nll_loss=2.793, ppl=6.93, wps=408000, ups=0.94, wpb=435804, bsz=16754.2, num_updates=16100, lr=0.000498445, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=15671 epoch 010: 925 / 1689 loss=4.391, nll_loss=2.793, ppl=6.93, wps=408000, ups=0.94, wpb=435804, bsz=16754.2, num_updates=16100, lr=0.000498445, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=15671 epoch 010: 925 / 1689 loss=4.391, nll_loss=2.793, ppl=6.93, wps=408000, ups=0.94, wpb=435804, bsz=16754.2, num_updates=16100, lr=0.000498445, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=15671 epoch 010: 925 / 1689 loss=4.391, nll_loss=2.793, ppl=6.93, wps=408000, ups=0.94, wpb=435804, bsz=16754.2, num_updates=16100, lr=0.000498445, gnorm=0.268, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=15671 epoch 010: 1026 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=453824, ups=1.05, wpb=433758, bsz=16602.2, num_updates=16200, lr=0.000496904, gnorm=0.282, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=15767 epoch 010: 1026 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=453824, ups=1.05, wpb=433758, bsz=16602.2, num_updates=16200, lr=0.000496904, gnorm=0.282, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=15767 epoch 010: 1026 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=453824, ups=1.05, wpb=433758, bsz=16602.2, num_updates=16200, lr=0.000496904, gnorm=0.282, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=15767 epoch 010: 1026 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=453824, ups=1.05, wpb=433758, bsz=16602.2, num_updates=16200, lr=0.000496904, gnorm=0.282, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=15767 epoch 010: 1026 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=453824, ups=1.05, wpb=433758, bsz=16602.2, num_updates=16200, lr=0.000496904, gnorm=0.282, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=15767 epoch 010: 1026 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=453824, ups=1.05, wpb=433758, bsz=16602.2, num_updates=16200, lr=0.000496904, gnorm=0.282, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=15767 epoch 010: 1026 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=453824, ups=1.05, wpb=433758, bsz=16602.2, num_updates=16200, lr=0.000496904, gnorm=0.282, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=15767 epoch 010: 1026 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=453824, ups=1.05, wpb=433758, bsz=16602.2, num_updates=16200, lr=0.000496904, gnorm=0.282, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=15767 epoch 010: 1026 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=453824, ups=1.05, wpb=433758, bsz=16602.2, num_updates=16200, lr=0.000496904, gnorm=0.282, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=15767 epoch 010: 1026 / 1689 loss=4.378, nll_loss=2.779, ppl=6.86, wps=453824, ups=1.05, wpb=433758, bsz=16602.2, num_updates=16200, lr=0.000496904, gnorm=0.282, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=15767 epoch 010: 1126 / 1689 loss=4.378, nll_loss=2.778, ppl=6.86, wps=459267, ups=1.06, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.261, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=15861 epoch 010: 1126 / 1689 loss=4.378, nll_loss=2.778, ppl=6.86, wps=459267, ups=1.06, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.261, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=15861 epoch 010: 1126 / 1689 loss=4.378, nll_loss=2.778, ppl=6.86, wps=459267, ups=1.06, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.261, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=15861 epoch 010: 1126 / 1689 loss=4.378, nll_loss=2.778, ppl=6.86, wps=459267, ups=1.06, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.261, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=15861 epoch 010: 1126 / 1689 loss=4.378, nll_loss=2.778, ppl=6.86, wps=459267, ups=1.06, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.261, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=15861 epoch 010: 1126 / 1689 loss=4.378, nll_loss=2.778, ppl=6.86, wps=459267, ups=1.06, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.261, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=15861 epoch 010: 1126 / 1689 loss=4.378, nll_loss=2.778, ppl=6.86, wps=459267, ups=1.06, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.261, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=15861 epoch 010: 1126 / 1689 loss=4.378, nll_loss=2.778, ppl=6.86, wps=459267, ups=1.06, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.261, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=15861 epoch 010: 1126 / 1689 loss=4.378, nll_loss=2.778, ppl=6.86, wps=459267, ups=1.06, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.261, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=15861 epoch 010: 1126 / 1689 loss=4.378, nll_loss=2.778, ppl=6.86, wps=459267, ups=1.06, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.261, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=15861 epoch 010: 1226 / 1689 loss=4.386, nll_loss=2.787, ppl=6.9, wps=463054, ups=1.07, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=15955 epoch 010: 1226 / 1689 loss=4.386, nll_loss=2.787, ppl=6.9, wps=463054, ups=1.07, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=15955 epoch 010: 1226 / 1689 loss=4.386, nll_loss=2.787, ppl=6.9, wps=463054, ups=1.07, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=15955 epoch 010: 1226 / 1689 loss=4.386, nll_loss=2.787, ppl=6.9, wps=463054, ups=1.07, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=15955 epoch 010: 1226 / 1689 loss=4.386, nll_loss=2.787, ppl=6.9, wps=463054, ups=1.07, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=15955 epoch 010: 1226 / 1689 loss=4.386, nll_loss=2.787, ppl=6.9, wps=463054, ups=1.07, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=15955 epoch 010: 1226 / 1689 loss=4.386, nll_loss=2.787, ppl=6.9, wps=463054, ups=1.07, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=15955 epoch 010: 1226 / 1689 loss=4.386, nll_loss=2.787, ppl=6.9, wps=463054, ups=1.07, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=15955 epoch 010: 1226 / 1689 loss=4.386, nll_loss=2.787, ppl=6.9, wps=463054, ups=1.07, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=15955 epoch 010: 1226 / 1689 loss=4.386, nll_loss=2.787, ppl=6.9, wps=463054, ups=1.07, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=15955 epoch 010: 1326 / 1689 loss=4.375, nll_loss=2.776, ppl=6.85, wps=458192, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.278, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=16049 epoch 010: 1326 / 1689 loss=4.375, nll_loss=2.776, ppl=6.85, wps=458192, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.278, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=16049 epoch 010: 1326 / 1689 loss=4.375, nll_loss=2.776, ppl=6.85, wps=458192, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.278, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=16049 epoch 010: 1326 / 1689 loss=4.375, nll_loss=2.776, ppl=6.85, wps=458192, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.278, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=16049 epoch 010: 1326 / 1689 loss=4.375, nll_loss=2.776, ppl=6.85, wps=458192, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.278, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=16049 epoch 010: 1326 / 1689 loss=4.375, nll_loss=2.776, ppl=6.85, wps=458192, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.278, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=16049 epoch 010: 1326 / 1689 loss=4.375, nll_loss=2.776, ppl=6.85, wps=458192, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.278, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=16049 epoch 010: 1326 / 1689 loss=4.375, nll_loss=2.776, ppl=6.85, wps=458192, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.278, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=16049 epoch 010: 1326 / 1689 loss=4.375, nll_loss=2.776, ppl=6.85, wps=458192, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.278, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=16049 epoch 010: 1326 / 1689 loss=4.375, nll_loss=2.776, ppl=6.85, wps=458192, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.278, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=16049 epoch 010: 1426 / 1689 loss=4.398, nll_loss=2.801, ppl=6.97, wps=457564, ups=1.05, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.273, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16144 epoch 010: 1426 / 1689 loss=4.398, nll_loss=2.801, ppl=6.97, wps=457564, ups=1.05, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.273, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16144 epoch 010: 1426 / 1689 loss=4.398, nll_loss=2.801, ppl=6.97, wps=457564, ups=1.05, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.273, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16144 epoch 010: 1426 / 1689 loss=4.398, nll_loss=2.801, ppl=6.97, wps=457564, ups=1.05, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.273, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16144 epoch 010: 1426 / 1689 loss=4.398, nll_loss=2.801, ppl=6.97, wps=457564, ups=1.05, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.273, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16144 epoch 010: 1426 / 1689 loss=4.398, nll_loss=2.801, ppl=6.97, wps=457564, ups=1.05, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.273, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16144 epoch 010: 1426 / 1689 loss=4.398, nll_loss=2.801, ppl=6.97, wps=457564, ups=1.05, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.273, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16144 epoch 010: 1426 / 1689 loss=4.398, nll_loss=2.801, ppl=6.97, wps=457564, ups=1.05, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.273, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16144 epoch 010: 1426 / 1689 loss=4.398, nll_loss=2.801, ppl=6.97, wps=457564, ups=1.05, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.273, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16144 epoch 010: 1426 / 1689 loss=4.398, nll_loss=2.801, ppl=6.97, wps=457564, ups=1.05, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.273, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16144 epoch 010: 1527 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=452517, ups=1.05, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=16240 epoch 010: 1527 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=452517, ups=1.05, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=16240 epoch 010: 1527 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=452517, ups=1.05, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=16240 epoch 010: 1527 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=452517, ups=1.05, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=16240 epoch 010: 1527 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=452517, ups=1.05, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=16240 epoch 010: 1527 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=452517, ups=1.05, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=16240 epoch 010: 1527 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=452517, ups=1.05, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=16240 epoch 010: 1527 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=452517, ups=1.05, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=16240 epoch 010: 1527 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=452517, ups=1.05, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=16240 epoch 010: 1527 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=452517, ups=1.05, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=16240 epoch 010: 1627 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=458362, ups=1.05, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=16335 epoch 010: 1627 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=458362, ups=1.05, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=16335 epoch 010: 1627 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=458362, ups=1.05, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=16335 epoch 010: 1627 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=458362, ups=1.05, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=16335 epoch 010: 1627 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=458362, ups=1.05, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=16335 epoch 010: 1627 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=458362, ups=1.05, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=16335 epoch 010: 1627 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=458362, ups=1.05, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=16335 epoch 010: 1627 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=458362, ups=1.05, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=16335 epoch 010: 1627 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=458362, ups=1.05, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=16335 epoch 010: 1627 / 1689 loss=4.392, nll_loss=2.795, ppl=6.94, wps=458362, ups=1.05, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=16335 end of epoch 10 (average epoch stats below) epoch 010 | loss 4.384 | nll_loss 2.784 | ppl 6.89 | wps 454356 | ups 1.05 | wpb 433552 | bsz 16502.8 | num_updates 16862 | lr 0.000487052 | gnorm 0.275 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 21 | wall 16392 epoch 010 | loss 4.384 | nll_loss 2.784 | ppl 6.89 | wps 454356 | ups 1.05 | wpb 433552 | bsz 16502.8 | num_updates 16862 | lr 0.000487052 | gnorm 0.275 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 21 | wall 16392 epoch 010 | loss 4.384 | nll_loss 2.784 | ppl 6.89 | wps 454356 | ups 1.05 | wpb 433552 | bsz 16502.8 | num_updates 16862 | lr 0.000487052 | gnorm 0.275 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 21 | wall 16392 epoch 010 | loss 4.384 | nll_loss 2.784 | ppl 6.89 | wps 454356 | ups 1.05 | wpb 433552 | bsz 16502.8 | num_updates 16862 | lr 0.000487052 | gnorm 0.275 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 21 | wall 16392 epoch 010 | loss 4.384 | nll_loss 2.784 | ppl 6.89 | wps 454356 | ups 1.05 | wpb 433552 | bsz 16502.8 | num_updates 16862 | lr 0.000487052 | gnorm 0.275 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 21 | wall 16392 epoch 010 | loss 4.384 | nll_loss 2.784 | ppl 6.89 | wps 454356 | ups 1.05 | wpb 433552 | bsz 16502.8 | num_updates 16862 | lr 0.000487052 | gnorm 0.275 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 21 | wall 16392 epoch 010 | loss 4.384 | nll_loss 2.784 | ppl 6.89 | wps 454356 | ups 1.05 | wpb 433552 | bsz 16502.8 | num_updates 16862 | lr 0.000487052 | gnorm 0.275 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 21 | wall 16392 epoch 010 | loss 4.384 | nll_loss 2.784 | ppl 6.89 | wps 454356 | ups 1.05 | wpb 433552 | bsz 16502.8 | num_updates 16862 | lr 0.000487052 | gnorm 0.275 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 21 | wall 16392 epoch 010 | loss 4.384 | nll_loss 2.784 | ppl 6.89 | wps 454356 | ups 1.05 | wpb 433552 | bsz 16502.8 | num_updates 16862 | lr 0.000487052 | gnorm 0.275 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 21 | wall 16392 epoch 010 | loss 4.384 | nll_loss 2.784 | ppl 6.89 | wps 454356 | ups 1.05 | wpb 433552 | bsz 16502.8 | num_updates 16862 | lr 0.000487052 | gnorm 0.275 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 21 | wall 16392 Start iterating over samples epoch 011: 38 / 1689 loss=4.376, nll_loss=2.776, ppl=6.85, wps=453680, ups=1.05, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=16430 epoch 011: 38 / 1689 loss=4.376, nll_loss=2.776, ppl=6.85, wps=453680, ups=1.05, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=16430 epoch 011: 38 / 1689 loss=4.376, nll_loss=2.776, ppl=6.85, wps=453680, ups=1.05, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=16430 epoch 011: 38 / 1689 loss=4.376, nll_loss=2.776, ppl=6.85, wps=453680, ups=1.05, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=16430 epoch 011: 38 / 1689 loss=4.376, nll_loss=2.776, ppl=6.85, wps=453680, ups=1.05, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=16430 epoch 011: 38 / 1689 loss=4.376, nll_loss=2.776, ppl=6.85, wps=453680, ups=1.05, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=16430 epoch 011: 38 / 1689 loss=4.376, nll_loss=2.776, ppl=6.85, wps=453680, ups=1.05, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=16430 epoch 011: 38 / 1689 loss=4.376, nll_loss=2.776, ppl=6.85, wps=453680, ups=1.05, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=16430 epoch 011: 38 / 1689 loss=4.376, nll_loss=2.776, ppl=6.85, wps=453680, ups=1.05, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=16430 epoch 011: 38 / 1689 loss=4.376, nll_loss=2.776, ppl=6.85, wps=453680, ups=1.05, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=16430 epoch 011: 38 / 1689 loss=4.376, nll_loss=2.776, ppl=6.85, wps=453680, ups=1.05, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=16430 epoch 011: 138 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=457602, ups=1.05, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16525 epoch 011: 138 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=457602, ups=1.05, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16525 epoch 011: 138 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=457602, ups=1.05, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16525 epoch 011: 138 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=457602, ups=1.05, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16525 epoch 011: 138 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=457602, ups=1.05, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16525 epoch 011: 138 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=457602, ups=1.05, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16525 epoch 011: 138 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=457602, ups=1.05, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16525 epoch 011: 138 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=457602, ups=1.05, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16525 epoch 011: 138 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=457602, ups=1.05, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16525 epoch 011: 138 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=457602, ups=1.05, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16525 epoch 011: 138 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=457602, ups=1.05, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.266, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=16525 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 4.339 | nll_loss 2.685 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.339 epoch 011 | valid on 'valid' subset | loss 4.339 | nll_loss 2.685 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.339 epoch 011 | valid on 'valid' subset | loss 4.339 | nll_loss 2.685 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.339 epoch 011 | valid on 'valid' subset | loss 4.339 | nll_loss 2.685 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.339 epoch 011 | valid on 'valid' subset | loss 4.339 | nll_loss 2.685 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.339 epoch 011 | valid on 'valid' subset | loss 4.339 | nll_loss 2.685 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.339 epoch 011 | valid on 'valid' subset | loss 4.339 | nll_loss 2.685 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.339 epoch 011 | valid on 'valid' subset | loss 4.339 | nll_loss 2.685 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.339 epoch 011 | valid on 'valid' subset | loss 4.339 | nll_loss 2.685 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.339 epoch 011 | valid on 'valid' subset | loss 4.339 | nll_loss 2.685 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.339 epoch 011 | valid on 'valid' subset | loss 4.339 | nll_loss 2.685 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.339 epoch 011: 238 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=381082, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=16638 epoch 011: 238 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=381082, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=16638 epoch 011: 238 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=381082, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=16638 epoch 011: 238 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=381082, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=16638 epoch 011: 238 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=381082, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=16638 epoch 011: 238 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=381082, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=16638 epoch 011: 238 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=381082, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=16638 epoch 011: 238 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=381082, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=16638 epoch 011: 238 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=381082, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=16638 epoch 011: 238 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=381082, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=16638 epoch 011: 238 / 1689 loss=4.354, nll_loss=2.751, ppl=6.73, wps=381082, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=16638 epoch 011: 338 / 1689 loss=4.357, nll_loss=2.755, ppl=6.75, wps=458700, ups=1.06, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.26, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=16732 epoch 011: 338 / 1689 loss=4.357, nll_loss=2.755, ppl=6.75, wps=458700, ups=1.06, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.26, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=16732 epoch 011: 338 / 1689 loss=4.357, nll_loss=2.755, ppl=6.75, wps=458700, ups=1.06, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.26, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=16732 epoch 011: 338 / 1689 loss=4.357, nll_loss=2.755, ppl=6.75, wps=458700, ups=1.06, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.26, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=16732 epoch 011: 338 / 1689 loss=4.357, nll_loss=2.755, ppl=6.75, wps=458700, ups=1.06, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.26, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=16732 epoch 011: 338 / 1689 loss=4.357, nll_loss=2.755, ppl=6.75, wps=458700, ups=1.06, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.26, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=16732 epoch 011: 338 / 1689 loss=4.357, nll_loss=2.755, ppl=6.75, wps=458700, ups=1.06, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.26, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=16732 epoch 011: 338 / 1689 loss=4.357, nll_loss=2.755, ppl=6.75, wps=458700, ups=1.06, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.26, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=16732 epoch 011: 338 / 1689 loss=4.357, nll_loss=2.755, ppl=6.75, wps=458700, ups=1.06, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.26, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=16732 epoch 011: 338 / 1689 loss=4.357, nll_loss=2.755, ppl=6.75, wps=458700, ups=1.06, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.26, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=16732 epoch 011: 338 / 1689 loss=4.357, nll_loss=2.755, ppl=6.75, wps=458700, ups=1.06, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.26, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=16732 epoch 011: 438 / 1689 loss=4.368, nll_loss=2.767, ppl=6.81, wps=459795, ups=1.06, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=16827 epoch 011: 438 / 1689 loss=4.368, nll_loss=2.767, ppl=6.81, wps=459795, ups=1.06, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=16827 epoch 011: 438 / 1689 loss=4.368, nll_loss=2.767, ppl=6.81, wps=459795, ups=1.06, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=16827 epoch 011: 438 / 1689 loss=4.368, nll_loss=2.767, ppl=6.81, wps=459795, ups=1.06, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=16827 epoch 011: 438 / 1689 loss=4.368, nll_loss=2.767, ppl=6.81, wps=459795, ups=1.06, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=16827 epoch 011: 438 / 1689 loss=4.368, nll_loss=2.767, ppl=6.81, wps=459795, ups=1.06, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=16827 epoch 011: 438 / 1689 loss=4.368, nll_loss=2.767, ppl=6.81, wps=459795, ups=1.06, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=16827 epoch 011: 438 / 1689 loss=4.368, nll_loss=2.767, ppl=6.81, wps=459795, ups=1.06, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=16827 epoch 011: 438 / 1689 loss=4.368, nll_loss=2.767, ppl=6.81, wps=459795, ups=1.06, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=16827 epoch 011: 438 / 1689 loss=4.368, nll_loss=2.767, ppl=6.81, wps=459795, ups=1.06, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=16827 epoch 011: 438 / 1689 loss=4.368, nll_loss=2.767, ppl=6.81, wps=459795, ups=1.06, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=16827 epoch 011: 539 / 1689 loss=4.369, nll_loss=2.768, ppl=6.81, wps=455019, ups=1.04, wpb=436159, bsz=16577.4, num_updates=17400, lr=0.000479463, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=16923 epoch 011: 539 / 1689 loss=4.369, nll_loss=2.768, ppl=6.81, wps=455019, ups=1.04, wpb=436159, bsz=16577.4, num_updates=17400, lr=0.000479463, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=16923 epoch 011: 539 / 1689 loss=4.369, nll_loss=2.768, ppl=6.81, wps=455019, ups=1.04, wpb=436159, bsz=16577.4, num_updates=17400, lr=0.000479463, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=16923 epoch 011: 539 / 1689 loss=4.369, nll_loss=2.768, ppl=6.81, wps=455019, ups=1.04, wpb=436159, bsz=16577.4, num_updates=17400, lr=0.000479463, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=16923 epoch 011: 539 / 1689 loss=4.369, nll_loss=2.768, ppl=6.81, wps=455019, ups=1.04, wpb=436159, bsz=16577.4, num_updates=17400, lr=0.000479463, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=16923 epoch 011: 539 / 1689 loss=4.369, nll_loss=2.768, ppl=6.81, wps=455019, ups=1.04, wpb=436159, bsz=16577.4, num_updates=17400, lr=0.000479463, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=16923 epoch 011: 539 / 1689 loss=4.369, nll_loss=2.768, ppl=6.81, wps=455019, ups=1.04, wpb=436159, bsz=16577.4, num_updates=17400, lr=0.000479463, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=16923 epoch 011: 539 / 1689 loss=4.369, nll_loss=2.768, ppl=6.81, wps=455019, ups=1.04, wpb=436159, bsz=16577.4, num_updates=17400, lr=0.000479463, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=16923 epoch 011: 539 / 1689 loss=4.369, nll_loss=2.768, ppl=6.81, wps=455019, ups=1.04, wpb=436159, bsz=16577.4, num_updates=17400, lr=0.000479463, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=16923 epoch 011: 539 / 1689 loss=4.369, nll_loss=2.768, ppl=6.81, wps=455019, ups=1.04, wpb=436159, bsz=16577.4, num_updates=17400, lr=0.000479463, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=16923 epoch 011: 539 / 1689 loss=4.369, nll_loss=2.768, ppl=6.81, wps=455019, ups=1.04, wpb=436159, bsz=16577.4, num_updates=17400, lr=0.000479463, gnorm=0.267, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=16923 epoch 011: 639 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463312, ups=1.06, wpb=435538, bsz=16281, num_updates=17500, lr=0.000478091, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17017 epoch 011: 639 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463312, ups=1.06, wpb=435538, bsz=16281, num_updates=17500, lr=0.000478091, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17017 epoch 011: 639 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463312, ups=1.06, wpb=435538, bsz=16281, num_updates=17500, lr=0.000478091, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17017 epoch 011: 639 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463312, ups=1.06, wpb=435538, bsz=16281, num_updates=17500, lr=0.000478091, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17017 epoch 011: 639 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463312, ups=1.06, wpb=435538, bsz=16281, num_updates=17500, lr=0.000478091, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17017 epoch 011: 639 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463312, ups=1.06, wpb=435538, bsz=16281, num_updates=17500, lr=0.000478091, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17017 epoch 011: 639 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463312, ups=1.06, wpb=435538, bsz=16281, num_updates=17500, lr=0.000478091, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17017 epoch 011: 639 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463312, ups=1.06, wpb=435538, bsz=16281, num_updates=17500, lr=0.000478091, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17017 epoch 011: 639 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463312, ups=1.06, wpb=435538, bsz=16281, num_updates=17500, lr=0.000478091, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17017 epoch 011: 639 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463312, ups=1.06, wpb=435538, bsz=16281, num_updates=17500, lr=0.000478091, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17017 epoch 011: 639 / 1689 loss=4.364, nll_loss=2.763, ppl=6.79, wps=463312, ups=1.06, wpb=435538, bsz=16281, num_updates=17500, lr=0.000478091, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17017 epoch 011: 739 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=458262, ups=1.06, wpb=433357, bsz=16413.3, num_updates=17600, lr=0.000476731, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=17111 epoch 011: 739 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=458262, ups=1.06, wpb=433357, bsz=16413.3, num_updates=17600, lr=0.000476731, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=17111 epoch 011: 739 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=458262, ups=1.06, wpb=433357, bsz=16413.3, num_updates=17600, lr=0.000476731, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=17111 epoch 011: 739 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=458262, ups=1.06, wpb=433357, bsz=16413.3, num_updates=17600, lr=0.000476731, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=17111 epoch 011: 739 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=458262, ups=1.06, wpb=433357, bsz=16413.3, num_updates=17600, lr=0.000476731, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=17111 epoch 011: 739 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=458262, ups=1.06, wpb=433357, bsz=16413.3, num_updates=17600, lr=0.000476731, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=17111 epoch 011: 739 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=458262, ups=1.06, wpb=433357, bsz=16413.3, num_updates=17600, lr=0.000476731, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=17111 epoch 011: 739 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=458262, ups=1.06, wpb=433357, bsz=16413.3, num_updates=17600, lr=0.000476731, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=17111 epoch 011: 739 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=458262, ups=1.06, wpb=433357, bsz=16413.3, num_updates=17600, lr=0.000476731, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=17111 epoch 011: 739 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=458262, ups=1.06, wpb=433357, bsz=16413.3, num_updates=17600, lr=0.000476731, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=17111 epoch 011: 739 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=458262, ups=1.06, wpb=433357, bsz=16413.3, num_updates=17600, lr=0.000476731, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=17111 epoch 011: 839 / 1689 loss=4.359, nll_loss=2.757, ppl=6.76, wps=459476, ups=1.06, wpb=434803, bsz=16688.9, num_updates=17700, lr=0.000475383, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17206 epoch 011: 839 / 1689 loss=4.359, nll_loss=2.757, ppl=6.76, wps=459476, ups=1.06, wpb=434803, bsz=16688.9, num_updates=17700, lr=0.000475383, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17206 epoch 011: 839 / 1689 loss=4.359, nll_loss=2.757, ppl=6.76, wps=459476, ups=1.06, wpb=434803, bsz=16688.9, num_updates=17700, lr=0.000475383, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17206 epoch 011: 839 / 1689 loss=4.359, nll_loss=2.757, ppl=6.76, wps=459476, ups=1.06, wpb=434803, bsz=16688.9, num_updates=17700, lr=0.000475383, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17206 epoch 011: 839 / 1689 loss=4.359, nll_loss=2.757, ppl=6.76, wps=459476, ups=1.06, wpb=434803, bsz=16688.9, num_updates=17700, lr=0.000475383, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17206 epoch 011: 839 / 1689 loss=4.359, nll_loss=2.757, ppl=6.76, wps=459476, ups=1.06, wpb=434803, bsz=16688.9, num_updates=17700, lr=0.000475383, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17206 epoch 011: 839 / 1689 loss=4.359, nll_loss=2.757, ppl=6.76, wps=459476, ups=1.06, wpb=434803, bsz=16688.9, num_updates=17700, lr=0.000475383, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17206 epoch 011: 839 / 1689 loss=4.359, nll_loss=2.757, ppl=6.76, wps=459476, ups=1.06, wpb=434803, bsz=16688.9, num_updates=17700, lr=0.000475383, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17206 epoch 011: 839 / 1689 loss=4.359, nll_loss=2.757, ppl=6.76, wps=459476, ups=1.06, wpb=434803, bsz=16688.9, num_updates=17700, lr=0.000475383, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17206 epoch 011: 839 / 1689 loss=4.359, nll_loss=2.757, ppl=6.76, wps=459476, ups=1.06, wpb=434803, bsz=16688.9, num_updates=17700, lr=0.000475383, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17206 epoch 011: 839 / 1689 loss=4.359, nll_loss=2.757, ppl=6.76, wps=459476, ups=1.06, wpb=434803, bsz=16688.9, num_updates=17700, lr=0.000475383, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=17206 epoch 011: 939 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=456159, ups=1.06, wpb=430852, bsz=16450, num_updates=17800, lr=0.000474045, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21, wall=17301 epoch 011: 939 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=456159, ups=1.06, wpb=430852, bsz=16450, num_updates=17800, lr=0.000474045, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21, wall=17301 epoch 011: 939 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=456159, ups=1.06, wpb=430852, bsz=16450, num_updates=17800, lr=0.000474045, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21, wall=17301 epoch 011: 939 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=456159, ups=1.06, wpb=430852, bsz=16450, num_updates=17800, lr=0.000474045, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21, wall=17301 epoch 011: 939 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=456159, ups=1.06, wpb=430852, bsz=16450, num_updates=17800, lr=0.000474045, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21, wall=17301 epoch 011: 939 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=456159, ups=1.06, wpb=430852, bsz=16450, num_updates=17800, lr=0.000474045, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21, wall=17301 epoch 011: 939 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=456159, ups=1.06, wpb=430852, bsz=16450, num_updates=17800, lr=0.000474045, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21, wall=17301 epoch 011: 939 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=456159, ups=1.06, wpb=430852, bsz=16450, num_updates=17800, lr=0.000474045, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21, wall=17301 epoch 011: 939 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=456159, ups=1.06, wpb=430852, bsz=16450, num_updates=17800, lr=0.000474045, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21, wall=17301 epoch 011: 939 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=456159, ups=1.06, wpb=430852, bsz=16450, num_updates=17800, lr=0.000474045, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21, wall=17301 epoch 011: 939 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=456159, ups=1.06, wpb=430852, bsz=16450, num_updates=17800, lr=0.000474045, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21, wall=17301 epoch 011: 1039 / 1689 loss=4.376, nll_loss=2.777, ppl=6.85, wps=457561, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=17395 epoch 011: 1039 / 1689 loss=4.376, nll_loss=2.777, ppl=6.85, wps=457561, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=17395 epoch 011: 1039 / 1689 loss=4.376, nll_loss=2.777, ppl=6.85, wps=457561, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=17395 epoch 011: 1039 / 1689 loss=4.376, nll_loss=2.777, ppl=6.85, wps=457561, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=17395 epoch 011: 1039 / 1689 loss=4.376, nll_loss=2.777, ppl=6.85, wps=457561, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=17395 epoch 011: 1039 / 1689 loss=4.376, nll_loss=2.777, ppl=6.85, wps=457561, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=17395 epoch 011: 1039 / 1689 loss=4.376, nll_loss=2.777, ppl=6.85, wps=457561, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=17395 epoch 011: 1039 / 1689 loss=4.376, nll_loss=2.777, ppl=6.85, wps=457561, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=17395 epoch 011: 1039 / 1689 loss=4.376, nll_loss=2.777, ppl=6.85, wps=457561, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=17395 epoch 011: 1039 / 1689 loss=4.376, nll_loss=2.777, ppl=6.85, wps=457561, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=17395 epoch 011: 1039 / 1689 loss=4.376, nll_loss=2.777, ppl=6.85, wps=457561, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=17395 epoch 011: 1140 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=453764, ups=1.05, wpb=433602, bsz=16598.4, num_updates=18000, lr=0.000471405, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17491 epoch 011: 1140 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=453764, ups=1.05, wpb=433602, bsz=16598.4, num_updates=18000, lr=0.000471405, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17491 epoch 011: 1140 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=453764, ups=1.05, wpb=433602, bsz=16598.4, num_updates=18000, lr=0.000471405, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17491 epoch 011: 1140 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=453764, ups=1.05, wpb=433602, bsz=16598.4, num_updates=18000, lr=0.000471405, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17491 epoch 011: 1140 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=453764, ups=1.05, wpb=433602, bsz=16598.4, num_updates=18000, lr=0.000471405, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17491 epoch 011: 1140 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=453764, ups=1.05, wpb=433602, bsz=16598.4, num_updates=18000, lr=0.000471405, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17491 epoch 011: 1140 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=453764, ups=1.05, wpb=433602, bsz=16598.4, num_updates=18000, lr=0.000471405, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17491 epoch 011: 1140 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=453764, ups=1.05, wpb=433602, bsz=16598.4, num_updates=18000, lr=0.000471405, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17491 epoch 011: 1140 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=453764, ups=1.05, wpb=433602, bsz=16598.4, num_updates=18000, lr=0.000471405, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17491 epoch 011: 1140 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=453764, ups=1.05, wpb=433602, bsz=16598.4, num_updates=18000, lr=0.000471405, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17491 epoch 011: 1140 / 1689 loss=4.361, nll_loss=2.76, ppl=6.77, wps=453764, ups=1.05, wpb=433602, bsz=16598.4, num_updates=18000, lr=0.000471405, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=18.8, wall=17491 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 4.333 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.333 epoch 011 | valid on 'valid' subset | loss 4.333 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.333 epoch 011 | valid on 'valid' subset | loss 4.333 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.333 epoch 011 | valid on 'valid' subset | loss 4.333 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.333 epoch 011 | valid on 'valid' subset | loss 4.333 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.333 epoch 011 | valid on 'valid' subset | loss 4.333 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.333 epoch 011 | valid on 'valid' subset | loss 4.333 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.333 epoch 011 | valid on 'valid' subset | loss 4.333 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.333 epoch 011 | valid on 'valid' subset | loss 4.333 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.333 epoch 011 | valid on 'valid' subset | loss 4.333 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.333 epoch 011 | valid on 'valid' subset | loss 4.333 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.333 epoch 011: 1240 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=306619, ups=0.71, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.261, clip=0, loss_scale=2, train_wall=104, gb_free=19.7, wall=17632 epoch 011: 1240 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=306619, ups=0.71, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.261, clip=0, loss_scale=2, train_wall=104, gb_free=19.7, wall=17632 epoch 011: 1240 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=306619, ups=0.71, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.261, clip=0, loss_scale=2, train_wall=104, gb_free=19.7, wall=17632 epoch 011: 1240 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=306619, ups=0.71, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.261, clip=0, loss_scale=2, train_wall=104, gb_free=19.7, wall=17632 epoch 011: 1240 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=306619, ups=0.71, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.261, clip=0, loss_scale=2, train_wall=104, gb_free=19.7, wall=17632 epoch 011: 1240 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=306619, ups=0.71, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.261, clip=0, loss_scale=2, train_wall=104, gb_free=19.7, wall=17632 epoch 011: 1240 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=306619, ups=0.71, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.261, clip=0, loss_scale=2, train_wall=104, gb_free=19.7, wall=17632 epoch 011: 1240 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=306619, ups=0.71, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.261, clip=0, loss_scale=2, train_wall=104, gb_free=19.7, wall=17632 epoch 011: 1240 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=306619, ups=0.71, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.261, clip=0, loss_scale=2, train_wall=104, gb_free=19.7, wall=17632 epoch 011: 1240 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=306619, ups=0.71, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.261, clip=0, loss_scale=2, train_wall=104, gb_free=19.7, wall=17632 epoch 011: 1240 / 1689 loss=4.353, nll_loss=2.751, ppl=6.73, wps=306619, ups=0.71, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.261, clip=0, loss_scale=2, train_wall=104, gb_free=19.7, wall=17632 epoch 011: 1340 / 1689 loss=4.36, nll_loss=2.758, ppl=6.77, wps=467740, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.274, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=17725 epoch 011: 1340 / 1689 loss=4.36, nll_loss=2.758, ppl=6.77, wps=467740, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.274, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=17725 epoch 011: 1340 / 1689 loss=4.36, nll_loss=2.758, ppl=6.77, wps=467740, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.274, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=17725 epoch 011: 1340 / 1689 loss=4.36, nll_loss=2.758, ppl=6.77, wps=467740, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.274, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=17725 epoch 011: 1340 / 1689 loss=4.36, nll_loss=2.758, ppl=6.77, wps=467740, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.274, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=17725 epoch 011: 1340 / 1689 loss=4.36, nll_loss=2.758, ppl=6.77, wps=467740, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.274, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=17725 epoch 011: 1340 / 1689 loss=4.36, nll_loss=2.758, ppl=6.77, wps=467740, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.274, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=17725 epoch 011: 1340 / 1689 loss=4.36, nll_loss=2.758, ppl=6.77, wps=467740, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.274, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=17725 epoch 011: 1340 / 1689 loss=4.36, nll_loss=2.758, ppl=6.77, wps=467740, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.274, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=17725 epoch 011: 1340 / 1689 loss=4.36, nll_loss=2.758, ppl=6.77, wps=467740, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.274, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=17725 epoch 011: 1340 / 1689 loss=4.36, nll_loss=2.758, ppl=6.77, wps=467740, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.274, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=17725 epoch 011: 1440 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=463441, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=17818 epoch 011: 1440 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=463441, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=17818 epoch 011: 1440 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=463441, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=17818 epoch 011: 1440 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=463441, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=17818 epoch 011: 1440 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=463441, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=17818 epoch 011: 1440 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=463441, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=17818 epoch 011: 1440 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=463441, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=17818 epoch 011: 1440 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=463441, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=17818 epoch 011: 1440 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=463441, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=17818 epoch 011: 1440 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=463441, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=17818 epoch 011: 1440 / 1689 loss=4.369, nll_loss=2.769, ppl=6.82, wps=463441, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.272, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=17818 epoch 011: 1540 / 1689 loss=4.366, nll_loss=2.766, ppl=6.8, wps=462238, ups=1.06, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.264, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17912 epoch 011: 1540 / 1689 loss=4.366, nll_loss=2.766, ppl=6.8, wps=462238, ups=1.06, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.264, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17912 epoch 011: 1540 / 1689 loss=4.366, nll_loss=2.766, ppl=6.8, wps=462238, ups=1.06, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.264, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17912 epoch 011: 1540 / 1689 loss=4.366, nll_loss=2.766, ppl=6.8, wps=462238, ups=1.06, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.264, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17912 epoch 011: 1540 / 1689 loss=4.366, nll_loss=2.766, ppl=6.8, wps=462238, ups=1.06, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.264, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17912 epoch 011: 1540 / 1689 loss=4.366, nll_loss=2.766, ppl=6.8, wps=462238, ups=1.06, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.264, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17912 epoch 011: 1540 / 1689 loss=4.366, nll_loss=2.766, ppl=6.8, wps=462238, ups=1.06, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.264, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17912 epoch 011: 1540 / 1689 loss=4.366, nll_loss=2.766, ppl=6.8, wps=462238, ups=1.06, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.264, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17912 epoch 011: 1540 / 1689 loss=4.366, nll_loss=2.766, ppl=6.8, wps=462238, ups=1.06, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.264, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17912 epoch 011: 1540 / 1689 loss=4.366, nll_loss=2.766, ppl=6.8, wps=462238, ups=1.06, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.264, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17912 epoch 011: 1540 / 1689 loss=4.366, nll_loss=2.766, ppl=6.8, wps=462238, ups=1.06, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.264, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=17912 epoch 011: 1640 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=461600, ups=1.07, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=18006 epoch 011: 1640 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=461600, ups=1.07, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=18006 epoch 011: 1640 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=461600, ups=1.07, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=18006 epoch 011: 1640 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=461600, ups=1.07, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=18006 epoch 011: 1640 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=461600, ups=1.07, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=18006 epoch 011: 1640 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=461600, ups=1.07, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=18006 epoch 011: 1640 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=461600, ups=1.07, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=18006 epoch 011: 1640 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=461600, ups=1.07, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=18006 epoch 011: 1640 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=461600, ups=1.07, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=18006 epoch 011: 1640 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=461600, ups=1.07, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=18006 epoch 011: 1640 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=461600, ups=1.07, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=93, gb_free=18.5, wall=18006 end of epoch 11 (average epoch stats below) epoch 011 | loss 4.363 | nll_loss 2.762 | ppl 6.78 | wps 440425 | ups 1.02 | wpb 433518 | bsz 16503.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.266 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.7 | wall 18052 epoch 011 | loss 4.363 | nll_loss 2.762 | ppl 6.78 | wps 440425 | ups 1.02 | wpb 433518 | bsz 16503.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.266 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.7 | wall 18052 epoch 011 | loss 4.363 | nll_loss 2.762 | ppl 6.78 | wps 440425 | ups 1.02 | wpb 433518 | bsz 16503.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.266 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.7 | wall 18052 epoch 011 | loss 4.363 | nll_loss 2.762 | ppl 6.78 | wps 440425 | ups 1.02 | wpb 433518 | bsz 16503.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.266 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.7 | wall 18052 epoch 011 | loss 4.363 | nll_loss 2.762 | ppl 6.78 | wps 440425 | ups 1.02 | wpb 433518 | bsz 16503.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.266 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.7 | wall 18052 epoch 011 | loss 4.363 | nll_loss 2.762 | ppl 6.78 | wps 440425 | ups 1.02 | wpb 433518 | bsz 16503.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.266 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.7 | wall 18052 epoch 011 | loss 4.363 | nll_loss 2.762 | ppl 6.78 | wps 440425 | ups 1.02 | wpb 433518 | bsz 16503.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.266 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.7 | wall 18052 epoch 011 | loss 4.363 | nll_loss 2.762 | ppl 6.78 | wps 440425 | ups 1.02 | wpb 433518 | bsz 16503.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.266 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.7 | wall 18052 epoch 011 | loss 4.363 | nll_loss 2.762 | ppl 6.78 | wps 440425 | ups 1.02 | wpb 433518 | bsz 16503.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.266 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.7 | wall 18052 epoch 011 | loss 4.363 | nll_loss 2.762 | ppl 6.78 | wps 440425 | ups 1.02 | wpb 433518 | bsz 16503.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.266 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.7 | wall 18052 epoch 011 | loss 4.363 | nll_loss 2.762 | ppl 6.78 | wps 440425 | ups 1.02 | wpb 433518 | bsz 16503.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.266 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 20.7 | wall 18052 Start iterating over samples epoch 012: 52 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=451335, ups=1.05, wpb=429405, bsz=16311.5, num_updates=18600, lr=0.000463739, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=18101 epoch 012: 52 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=451335, ups=1.05, wpb=429405, bsz=16311.5, num_updates=18600, lr=0.000463739, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=18101 epoch 012: 52 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=451335, ups=1.05, wpb=429405, bsz=16311.5, num_updates=18600, lr=0.000463739, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=18101 epoch 012: 52 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=451335, ups=1.05, wpb=429405, bsz=16311.5, num_updates=18600, lr=0.000463739, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=18101 epoch 012: 52 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=451335, ups=1.05, wpb=429405, bsz=16311.5, num_updates=18600, lr=0.000463739, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=18101 epoch 012: 52 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=451335, ups=1.05, wpb=429405, bsz=16311.5, num_updates=18600, lr=0.000463739, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=18101 epoch 012: 52 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=451335, ups=1.05, wpb=429405, bsz=16311.5, num_updates=18600, lr=0.000463739, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=18101 epoch 012: 52 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=451335, ups=1.05, wpb=429405, bsz=16311.5, num_updates=18600, lr=0.000463739, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=18101 epoch 012: 52 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=451335, ups=1.05, wpb=429405, bsz=16311.5, num_updates=18600, lr=0.000463739, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=18101 epoch 012: 52 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=451335, ups=1.05, wpb=429405, bsz=16311.5, num_updates=18600, lr=0.000463739, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=18101 epoch 012: 52 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=451335, ups=1.05, wpb=429405, bsz=16311.5, num_updates=18600, lr=0.000463739, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=18101 epoch 012: 52 / 1689 loss=4.35, nll_loss=2.747, ppl=6.71, wps=451335, ups=1.05, wpb=429405, bsz=16311.5, num_updates=18600, lr=0.000463739, gnorm=0.27, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=18101 epoch 012: 152 / 1689 loss=4.335, nll_loss=2.729, ppl=6.63, wps=462841, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=18195 epoch 012: 152 / 1689 loss=4.335, nll_loss=2.729, ppl=6.63, wps=462841, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=18195 epoch 012: 152 / 1689 loss=4.335, nll_loss=2.729, ppl=6.63, wps=462841, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=18195 epoch 012: 152 / 1689 loss=4.335, nll_loss=2.729, ppl=6.63, wps=462841, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=18195 epoch 012: 152 / 1689 loss=4.335, nll_loss=2.729, ppl=6.63, wps=462841, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=18195 epoch 012: 152 / 1689 loss=4.335, nll_loss=2.729, ppl=6.63, wps=462841, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=18195 epoch 012: 152 / 1689 loss=4.335, nll_loss=2.729, ppl=6.63, wps=462841, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=18195 epoch 012: 152 / 1689 loss=4.335, nll_loss=2.729, ppl=6.63, wps=462841, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=18195 epoch 012: 152 / 1689 loss=4.335, nll_loss=2.729, ppl=6.63, wps=462841, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=18195 epoch 012: 152 / 1689 loss=4.335, nll_loss=2.729, ppl=6.63, wps=462841, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=18195 epoch 012: 152 / 1689 loss=4.335, nll_loss=2.729, ppl=6.63, wps=462841, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=18195 epoch 012: 152 / 1689 loss=4.335, nll_loss=2.729, ppl=6.63, wps=462841, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=18195 epoch 012: 252 / 1689 loss=4.337, nll_loss=2.733, ppl=6.65, wps=464611, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18288 epoch 012: 252 / 1689 loss=4.337, nll_loss=2.733, ppl=6.65, wps=464611, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18288 epoch 012: 252 / 1689 loss=4.337, nll_loss=2.733, ppl=6.65, wps=464611, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18288 epoch 012: 252 / 1689 loss=4.337, nll_loss=2.733, ppl=6.65, wps=464611, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18288 epoch 012: 252 / 1689 loss=4.337, nll_loss=2.733, ppl=6.65, wps=464611, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18288 epoch 012: 252 / 1689 loss=4.337, nll_loss=2.733, ppl=6.65, wps=464611, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18288 epoch 012: 252 / 1689 loss=4.337, nll_loss=2.733, ppl=6.65, wps=464611, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18288 epoch 012: 252 / 1689 loss=4.337, nll_loss=2.733, ppl=6.65, wps=464611, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18288 epoch 012: 252 / 1689 loss=4.337, nll_loss=2.733, ppl=6.65, wps=464611, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18288 epoch 012: 252 / 1689 loss=4.337, nll_loss=2.733, ppl=6.65, wps=464611, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18288 epoch 012: 252 / 1689 loss=4.337, nll_loss=2.733, ppl=6.65, wps=464611, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18288 epoch 012: 252 / 1689 loss=4.337, nll_loss=2.733, ppl=6.65, wps=464611, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18288 epoch 012: 352 / 1689 loss=4.343, nll_loss=2.739, ppl=6.68, wps=459678, ups=1.06, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=18382 epoch 012: 352 / 1689 loss=4.343, nll_loss=2.739, ppl=6.68, wps=459678, ups=1.06, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=18382 epoch 012: 352 / 1689 loss=4.343, nll_loss=2.739, ppl=6.68, wps=459678, ups=1.06, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=18382 epoch 012: 352 / 1689 loss=4.343, nll_loss=2.739, ppl=6.68, wps=459678, ups=1.06, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=18382 epoch 012: 352 / 1689 loss=4.343, nll_loss=2.739, ppl=6.68, wps=459678, ups=1.06, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=18382 epoch 012: 352 / 1689 loss=4.343, nll_loss=2.739, ppl=6.68, wps=459678, ups=1.06, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=18382 epoch 012: 352 / 1689 loss=4.343, nll_loss=2.739, ppl=6.68, wps=459678, ups=1.06, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=18382 epoch 012: 352 / 1689 loss=4.343, nll_loss=2.739, ppl=6.68, wps=459678, ups=1.06, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=18382 epoch 012: 352 / 1689 loss=4.343, nll_loss=2.739, ppl=6.68, wps=459678, ups=1.06, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=18382 epoch 012: 352 / 1689 loss=4.343, nll_loss=2.739, ppl=6.68, wps=459678, ups=1.06, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=18382 epoch 012: 352 / 1689 loss=4.343, nll_loss=2.739, ppl=6.68, wps=459678, ups=1.06, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=18382 epoch 012: 352 / 1689 loss=4.343, nll_loss=2.739, ppl=6.68, wps=459678, ups=1.06, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=18382 epoch 012: 452 / 1689 loss=4.347, nll_loss=2.744, ppl=6.7, wps=459633, ups=1.06, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.263, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18477 epoch 012: 452 / 1689 loss=4.347, nll_loss=2.744, ppl=6.7, wps=459633, ups=1.06, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.263, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18477 epoch 012: 452 / 1689 loss=4.347, nll_loss=2.744, ppl=6.7, wps=459633, ups=1.06, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.263, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18477 epoch 012: 452 / 1689 loss=4.347, nll_loss=2.744, ppl=6.7, wps=459633, ups=1.06, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.263, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18477 epoch 012: 452 / 1689 loss=4.347, nll_loss=2.744, ppl=6.7, wps=459633, ups=1.06, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.263, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18477 epoch 012: 452 / 1689 loss=4.347, nll_loss=2.744, ppl=6.7, wps=459633, ups=1.06, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.263, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18477 epoch 012: 452 / 1689 loss=4.347, nll_loss=2.744, ppl=6.7, wps=459633, ups=1.06, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.263, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18477 epoch 012: 452 / 1689 loss=4.347, nll_loss=2.744, ppl=6.7, wps=459633, ups=1.06, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.263, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18477 epoch 012: 452 / 1689 loss=4.347, nll_loss=2.744, ppl=6.7, wps=459633, ups=1.06, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.263, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18477 epoch 012: 452 / 1689 loss=4.347, nll_loss=2.744, ppl=6.7, wps=459633, ups=1.06, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.263, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18477 epoch 012: 452 / 1689 loss=4.347, nll_loss=2.744, ppl=6.7, wps=459633, ups=1.06, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.263, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18477 epoch 012: 452 / 1689 loss=4.347, nll_loss=2.744, ppl=6.7, wps=459633, ups=1.06, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.263, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=18477 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 4.321 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.321 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.321 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.321 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.321 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.321 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.321 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.321 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.321 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.321 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.321 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.321 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.321 epoch 012: 553 / 1689 loss=4.342, nll_loss=2.738, ppl=6.67, wps=376641, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18591 epoch 012: 553 / 1689 loss=4.342, nll_loss=2.738, ppl=6.67, wps=376641, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18591 epoch 012: 553 / 1689 loss=4.342, nll_loss=2.738, ppl=6.67, wps=376641, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18591 epoch 012: 553 / 1689 loss=4.342, nll_loss=2.738, ppl=6.67, wps=376641, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18591 epoch 012: 553 / 1689 loss=4.342, nll_loss=2.738, ppl=6.67, wps=376641, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18591 epoch 012: 553 / 1689 loss=4.342, nll_loss=2.738, ppl=6.67, wps=376641, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18591 epoch 012: 553 / 1689 loss=4.342, nll_loss=2.738, ppl=6.67, wps=376641, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18591 epoch 012: 553 / 1689 loss=4.342, nll_loss=2.738, ppl=6.67, wps=376641, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18591 epoch 012: 553 / 1689 loss=4.342, nll_loss=2.738, ppl=6.67, wps=376641, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18591 epoch 012: 553 / 1689 loss=4.342, nll_loss=2.738, ppl=6.67, wps=376641, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18591 epoch 012: 553 / 1689 loss=4.342, nll_loss=2.738, ppl=6.67, wps=376641, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18591 epoch 012: 553 / 1689 loss=4.342, nll_loss=2.738, ppl=6.67, wps=376641, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=18591 epoch 012: 653 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=462518, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=18685 epoch 012: 653 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=462518, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=18685 epoch 012: 653 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=462518, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=18685 epoch 012: 653 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=462518, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=18685 epoch 012: 653 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=462518, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=18685 epoch 012: 653 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=462518, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=18685 epoch 012: 653 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=462518, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=18685 epoch 012: 653 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=462518, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=18685 epoch 012: 653 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=462518, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=18685 epoch 012: 653 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=462518, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=18685 epoch 012: 653 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=462518, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=18685 epoch 012: 653 / 1689 loss=4.351, nll_loss=2.749, ppl=6.72, wps=462518, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=18685 epoch 012: 753 / 1689 loss=4.346, nll_loss=2.743, ppl=6.69, wps=462232, ups=1.07, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=18779 epoch 012: 753 / 1689 loss=4.346, nll_loss=2.743, ppl=6.69, wps=462232, ups=1.07, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=18779 epoch 012: 753 / 1689 loss=4.346, nll_loss=2.743, ppl=6.69, wps=462232, ups=1.07, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=18779 epoch 012: 753 / 1689 loss=4.346, nll_loss=2.743, ppl=6.69, wps=462232, ups=1.07, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=18779 epoch 012: 753 / 1689 loss=4.346, nll_loss=2.743, ppl=6.69, wps=462232, ups=1.07, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=18779 epoch 012: 753 / 1689 loss=4.346, nll_loss=2.743, ppl=6.69, wps=462232, ups=1.07, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=18779 epoch 012: 753 / 1689 loss=4.346, nll_loss=2.743, ppl=6.69, wps=462232, ups=1.07, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=18779 epoch 012: 753 / 1689 loss=4.346, nll_loss=2.743, ppl=6.69, wps=462232, ups=1.07, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=18779 epoch 012: 753 / 1689 loss=4.346, nll_loss=2.743, ppl=6.69, wps=462232, ups=1.07, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=18779 epoch 012: 753 / 1689 loss=4.346, nll_loss=2.743, ppl=6.69, wps=462232, ups=1.07, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=18779 epoch 012: 753 / 1689 loss=4.346, nll_loss=2.743, ppl=6.69, wps=462232, ups=1.07, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=18779 epoch 012: 753 / 1689 loss=4.346, nll_loss=2.743, ppl=6.69, wps=462232, ups=1.07, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=18779 epoch 012: 853 / 1689 loss=4.343, nll_loss=2.741, ppl=6.68, wps=462588, ups=1.07, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18872 epoch 012: 853 / 1689 loss=4.343, nll_loss=2.741, ppl=6.68, wps=462588, ups=1.07, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18872 epoch 012: 853 / 1689 loss=4.343, nll_loss=2.741, ppl=6.68, wps=462588, ups=1.07, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18872 epoch 012: 853 / 1689 loss=4.343, nll_loss=2.741, ppl=6.68, wps=462588, ups=1.07, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18872 epoch 012: 853 / 1689 loss=4.343, nll_loss=2.741, ppl=6.68, wps=462588, ups=1.07, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18872 epoch 012: 853 / 1689 loss=4.343, nll_loss=2.741, ppl=6.68, wps=462588, ups=1.07, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18872 epoch 012: 853 / 1689 loss=4.343, nll_loss=2.741, ppl=6.68, wps=462588, ups=1.07, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18872 epoch 012: 853 / 1689 loss=4.343, nll_loss=2.741, ppl=6.68, wps=462588, ups=1.07, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18872 epoch 012: 853 / 1689 loss=4.343, nll_loss=2.741, ppl=6.68, wps=462588, ups=1.07, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18872 epoch 012: 853 / 1689 loss=4.343, nll_loss=2.741, ppl=6.68, wps=462588, ups=1.07, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18872 epoch 012: 853 / 1689 loss=4.343, nll_loss=2.741, ppl=6.68, wps=462588, ups=1.07, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18872 epoch 012: 853 / 1689 loss=4.343, nll_loss=2.741, ppl=6.68, wps=462588, ups=1.07, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=18872 epoch 012: 953 / 1689 loss=4.339, nll_loss=2.736, ppl=6.66, wps=461192, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=18967 epoch 012: 953 / 1689 loss=4.339, nll_loss=2.736, ppl=6.66, wps=461192, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=18967 epoch 012: 953 / 1689 loss=4.339, nll_loss=2.736, ppl=6.66, wps=461192, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=18967 epoch 012: 953 / 1689 loss=4.339, nll_loss=2.736, ppl=6.66, wps=461192, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=18967 epoch 012: 953 / 1689 loss=4.339, nll_loss=2.736, ppl=6.66, wps=461192, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=18967 epoch 012: 953 / 1689 loss=4.339, nll_loss=2.736, ppl=6.66, wps=461192, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=18967 epoch 012: 953 / 1689 loss=4.339, nll_loss=2.736, ppl=6.66, wps=461192, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=18967 epoch 012: 953 / 1689 loss=4.339, nll_loss=2.736, ppl=6.66, wps=461192, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=18967 epoch 012: 953 / 1689 loss=4.339, nll_loss=2.736, ppl=6.66, wps=461192, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=18967 epoch 012: 953 / 1689 loss=4.339, nll_loss=2.736, ppl=6.66, wps=461192, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=18967 epoch 012: 953 / 1689 loss=4.339, nll_loss=2.736, ppl=6.66, wps=461192, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=18967 epoch 012: 953 / 1689 loss=4.339, nll_loss=2.736, ppl=6.66, wps=461192, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=21.1, wall=18967 epoch 012: 1053 / 1689 loss=4.34, nll_loss=2.737, ppl=6.67, wps=458843, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=19062 epoch 012: 1053 / 1689 loss=4.34, nll_loss=2.737, ppl=6.67, wps=458843, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=19062 epoch 012: 1053 / 1689 loss=4.34, nll_loss=2.737, ppl=6.67, wps=458843, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=19062 epoch 012: 1053 / 1689 loss=4.34, nll_loss=2.737, ppl=6.67, wps=458843, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=19062 epoch 012: 1053 / 1689 loss=4.34, nll_loss=2.737, ppl=6.67, wps=458843, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=19062 epoch 012: 1053 / 1689 loss=4.34, nll_loss=2.737, ppl=6.67, wps=458843, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=19062 epoch 012: 1053 / 1689 loss=4.34, nll_loss=2.737, ppl=6.67, wps=458843, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=19062 epoch 012: 1053 / 1689 loss=4.34, nll_loss=2.737, ppl=6.67, wps=458843, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=19062 epoch 012: 1053 / 1689 loss=4.34, nll_loss=2.737, ppl=6.67, wps=458843, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=19062 epoch 012: 1053 / 1689 loss=4.34, nll_loss=2.737, ppl=6.67, wps=458843, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=19062 epoch 012: 1053 / 1689 loss=4.34, nll_loss=2.737, ppl=6.67, wps=458843, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=19062 epoch 012: 1053 / 1689 loss=4.34, nll_loss=2.737, ppl=6.67, wps=458843, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=19062 epoch 012: 1153 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457072, ups=1.05, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=19157 epoch 012: 1153 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457072, ups=1.05, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=19157 epoch 012: 1153 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457072, ups=1.05, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=19157 epoch 012: 1153 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457072, ups=1.05, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=19157 epoch 012: 1153 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457072, ups=1.05, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=19157 epoch 012: 1153 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457072, ups=1.05, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=19157 epoch 012: 1153 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457072, ups=1.05, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=19157 epoch 012: 1153 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457072, ups=1.05, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=19157 epoch 012: 1153 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457072, ups=1.05, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=19157 epoch 012: 1153 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457072, ups=1.05, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=19157 epoch 012: 1153 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457072, ups=1.05, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=19157 epoch 012: 1153 / 1689 loss=4.346, nll_loss=2.744, ppl=6.7, wps=457072, ups=1.05, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.258, clip=0, loss_scale=4, train_wall=93, gb_free=18.2, wall=19157 epoch 012: 1254 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=454107, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=19252 epoch 012: 1254 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=454107, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=19252 epoch 012: 1254 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=454107, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=19252 epoch 012: 1254 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=454107, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=19252 epoch 012: 1254 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=454107, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=19252 epoch 012: 1254 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=454107, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=19252 epoch 012: 1254 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=454107, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=19252 epoch 012: 1254 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=454107, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=19252 epoch 012: 1254 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=454107, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=19252 epoch 012: 1254 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=454107, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=19252 epoch 012: 1254 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=454107, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=19252 epoch 012: 1254 / 1689 loss=4.353, nll_loss=2.752, ppl=6.74, wps=454107, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.252, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=19252 epoch 012: 1354 / 1689 loss=4.35, nll_loss=2.749, ppl=6.72, wps=461564, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=21.4, wall=19346 epoch 012: 1354 / 1689 loss=4.35, nll_loss=2.749, ppl=6.72, wps=461564, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=21.4, wall=19346 epoch 012: 1354 / 1689 loss=4.35, nll_loss=2.749, ppl=6.72, wps=461564, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=21.4, wall=19346 epoch 012: 1354 / 1689 loss=4.35, nll_loss=2.749, ppl=6.72, wps=461564, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=21.4, wall=19346 epoch 012: 1354 / 1689 loss=4.35, nll_loss=2.749, ppl=6.72, wps=461564, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=21.4, wall=19346 epoch 012: 1354 / 1689 loss=4.35, nll_loss=2.749, ppl=6.72, wps=461564, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=21.4, wall=19346 epoch 012: 1354 / 1689 loss=4.35, nll_loss=2.749, ppl=6.72, wps=461564, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=21.4, wall=19346 epoch 012: 1354 / 1689 loss=4.35, nll_loss=2.749, ppl=6.72, wps=461564, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=21.4, wall=19346 epoch 012: 1354 / 1689 loss=4.35, nll_loss=2.749, ppl=6.72, wps=461564, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=21.4, wall=19346 epoch 012: 1354 / 1689 loss=4.35, nll_loss=2.749, ppl=6.72, wps=461564, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=21.4, wall=19346 epoch 012: 1354 / 1689 loss=4.35, nll_loss=2.749, ppl=6.72, wps=461564, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=21.4, wall=19346 epoch 012: 1354 / 1689 loss=4.35, nll_loss=2.749, ppl=6.72, wps=461564, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=21.4, wall=19346 epoch 012: 1454 / 1689 loss=4.358, nll_loss=2.757, ppl=6.76, wps=459502, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19441 epoch 012: 1454 / 1689 loss=4.358, nll_loss=2.757, ppl=6.76, wps=459502, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19441 epoch 012: 1454 / 1689 loss=4.358, nll_loss=2.757, ppl=6.76, wps=459502, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19441 epoch 012: 1454 / 1689 loss=4.358, nll_loss=2.757, ppl=6.76, wps=459502, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19441 epoch 012: 1454 / 1689 loss=4.358, nll_loss=2.757, ppl=6.76, wps=459502, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19441 epoch 012: 1454 / 1689 loss=4.358, nll_loss=2.757, ppl=6.76, wps=459502, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19441 epoch 012: 1454 / 1689 loss=4.358, nll_loss=2.757, ppl=6.76, wps=459502, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19441 epoch 012: 1454 / 1689 loss=4.358, nll_loss=2.757, ppl=6.76, wps=459502, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19441 epoch 012: 1454 / 1689 loss=4.358, nll_loss=2.757, ppl=6.76, wps=459502, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19441 epoch 012: 1454 / 1689 loss=4.358, nll_loss=2.757, ppl=6.76, wps=459502, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19441 epoch 012: 1454 / 1689 loss=4.358, nll_loss=2.757, ppl=6.76, wps=459502, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19441 epoch 012: 1454 / 1689 loss=4.358, nll_loss=2.757, ppl=6.76, wps=459502, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.261, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19441 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 4.333 | nll_loss 2.687 | ppl 6.44 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.333 | nll_loss 2.687 | ppl 6.44 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.333 | nll_loss 2.687 | ppl 6.44 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.333 | nll_loss 2.687 | ppl 6.44 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.333 | nll_loss 2.687 | ppl 6.44 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.333 | nll_loss 2.687 | ppl 6.44 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.333 | nll_loss 2.687 | ppl 6.44 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.333 | nll_loss 2.687 | ppl 6.44 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.333 | nll_loss 2.687 | ppl 6.44 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.333 | nll_loss 2.687 | ppl 6.44 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.333 | nll_loss 2.687 | ppl 6.44 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.321 epoch 012 | valid on 'valid' subset | loss 4.333 | nll_loss 2.687 | ppl 6.44 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.321 epoch 012: 1554 / 1689 loss=4.357, nll_loss=2.756, ppl=6.76, wps=315154, ups=0.73, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.246, clip=0, loss_scale=2, train_wall=120, gb_free=18.8, wall=19579 epoch 012: 1554 / 1689 loss=4.357, nll_loss=2.756, ppl=6.76, wps=315154, ups=0.73, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.246, clip=0, loss_scale=2, train_wall=120, gb_free=18.8, wall=19579 epoch 012: 1554 / 1689 loss=4.357, nll_loss=2.756, ppl=6.76, wps=315154, ups=0.73, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.246, clip=0, loss_scale=2, train_wall=120, gb_free=18.8, wall=19579 epoch 012: 1554 / 1689 loss=4.357, nll_loss=2.756, ppl=6.76, wps=315154, ups=0.73, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.246, clip=0, loss_scale=2, train_wall=120, gb_free=18.8, wall=19579 epoch 012: 1554 / 1689 loss=4.357, nll_loss=2.756, ppl=6.76, wps=315154, ups=0.73, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.246, clip=0, loss_scale=2, train_wall=120, gb_free=18.8, wall=19579 epoch 012: 1554 / 1689 loss=4.357, nll_loss=2.756, ppl=6.76, wps=315154, ups=0.73, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.246, clip=0, loss_scale=2, train_wall=120, gb_free=18.8, wall=19579 epoch 012: 1554 / 1689 loss=4.357, nll_loss=2.756, ppl=6.76, wps=315154, ups=0.73, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.246, clip=0, loss_scale=2, train_wall=120, gb_free=18.8, wall=19579 epoch 012: 1554 / 1689 loss=4.357, nll_loss=2.756, ppl=6.76, wps=315154, ups=0.73, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.246, clip=0, loss_scale=2, train_wall=120, gb_free=18.8, wall=19579 epoch 012: 1554 / 1689 loss=4.357, nll_loss=2.756, ppl=6.76, wps=315154, ups=0.73, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.246, clip=0, loss_scale=2, train_wall=120, gb_free=18.8, wall=19579 epoch 012: 1554 / 1689 loss=4.357, nll_loss=2.756, ppl=6.76, wps=315154, ups=0.73, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.246, clip=0, loss_scale=2, train_wall=120, gb_free=18.8, wall=19579 epoch 012: 1554 / 1689 loss=4.357, nll_loss=2.756, ppl=6.76, wps=315154, ups=0.73, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.246, clip=0, loss_scale=2, train_wall=120, gb_free=18.8, wall=19579 epoch 012: 1554 / 1689 loss=4.357, nll_loss=2.756, ppl=6.76, wps=315154, ups=0.73, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.246, clip=0, loss_scale=2, train_wall=120, gb_free=18.8, wall=19579 epoch 012: 1654 / 1689 loss=4.351, nll_loss=2.75, ppl=6.73, wps=462644, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=19673 epoch 012: 1654 / 1689 loss=4.351, nll_loss=2.75, ppl=6.73, wps=462644, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=19673 epoch 012: 1654 / 1689 loss=4.351, nll_loss=2.75, ppl=6.73, wps=462644, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=19673 epoch 012: 1654 / 1689 loss=4.351, nll_loss=2.75, ppl=6.73, wps=462644, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=19673 epoch 012: 1654 / 1689 loss=4.351, nll_loss=2.75, ppl=6.73, wps=462644, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=19673 epoch 012: 1654 / 1689 loss=4.351, nll_loss=2.75, ppl=6.73, wps=462644, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=19673 epoch 012: 1654 / 1689 loss=4.351, nll_loss=2.75, ppl=6.73, wps=462644, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=19673 epoch 012: 1654 / 1689 loss=4.351, nll_loss=2.75, ppl=6.73, wps=462644, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=19673 epoch 012: 1654 / 1689 loss=4.351, nll_loss=2.75, ppl=6.73, wps=462644, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=19673 epoch 012: 1654 / 1689 loss=4.351, nll_loss=2.75, ppl=6.73, wps=462644, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=19673 epoch 012: 1654 / 1689 loss=4.351, nll_loss=2.75, ppl=6.73, wps=462644, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=19673 epoch 012: 1654 / 1689 loss=4.351, nll_loss=2.75, ppl=6.73, wps=462644, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=19673 end of epoch 12 (average epoch stats below) epoch 012 | loss 4.346 | nll_loss 2.743 | ppl 6.69 | wps 442512 | ups 1.02 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.256 | clip 0 | loss_scale 4 | train_wall 1590 | gb_free 20.6 | wall 19705 epoch 012 | loss 4.346 | nll_loss 2.743 | ppl 6.69 | wps 442512 | ups 1.02 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.256 | clip 0 | loss_scale 4 | train_wall 1590 | gb_free 20.6 | wall 19705 epoch 012 | loss 4.346 | nll_loss 2.743 | ppl 6.69 | wps 442512 | ups 1.02 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.256 | clip 0 | loss_scale 4 | train_wall 1590 | gb_free 20.6 | wall 19705 epoch 012 | loss 4.346 | nll_loss 2.743 | ppl 6.69 | wps 442512 | ups 1.02 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.256 | clip 0 | loss_scale 4 | train_wall 1590 | gb_free 20.6 | wall 19705 epoch 012 | loss 4.346 | nll_loss 2.743 | ppl 6.69 | wps 442512 | ups 1.02 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.256 | clip 0 | loss_scale 4 | train_wall 1590 | gb_free 20.6 | wall 19705 epoch 012 | loss 4.346 | nll_loss 2.743 | ppl 6.69 | wps 442512 | ups 1.02 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.256 | clip 0 | loss_scale 4 | train_wall 1590 | gb_free 20.6 | wall 19705 epoch 012 | loss 4.346 | nll_loss 2.743 | ppl 6.69 | wps 442512 | ups 1.02 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.256 | clip 0 | loss_scale 4 | train_wall 1590 | gb_free 20.6 | wall 19705 epoch 012 | loss 4.346 | nll_loss 2.743 | ppl 6.69 | wps 442512 | ups 1.02 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.256 | clip 0 | loss_scale 4 | train_wall 1590 | gb_free 20.6 | wall 19705 epoch 012 | loss 4.346 | nll_loss 2.743 | ppl 6.69 | wps 442512 | ups 1.02 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.256 | clip 0 | loss_scale 4 | train_wall 1590 | gb_free 20.6 | wall 19705 epoch 012 | loss 4.346 | nll_loss 2.743 | ppl 6.69 | wps 442512 | ups 1.02 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.256 | clip 0 | loss_scale 4 | train_wall 1590 | gb_free 20.6 | wall 19705 epoch 012 | loss 4.346 | nll_loss 2.743 | ppl 6.69 | wps 442512 | ups 1.02 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.256 | clip 0 | loss_scale 4 | train_wall 1590 | gb_free 20.6 | wall 19705 epoch 012 | loss 4.346 | nll_loss 2.743 | ppl 6.69 | wps 442512 | ups 1.02 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.256 | clip 0 | loss_scale 4 | train_wall 1590 | gb_free 20.6 | wall 19705 Start iterating over samples epoch 013: 65 / 1689 loss=4.324, nll_loss=2.718, ppl=6.58, wps=456818, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.257, clip=0, loss_scale=4, train_wall=92, gb_free=18.5, wall=19767 epoch 013: 65 / 1689 loss=4.324, nll_loss=2.718, ppl=6.58, wps=456818, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.257, clip=0, loss_scale=4, train_wall=92, gb_free=18.5, wall=19767 epoch 013: 65 / 1689 loss=4.324, nll_loss=2.718, ppl=6.58, wps=456818, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.257, clip=0, loss_scale=4, train_wall=92, gb_free=18.5, wall=19767 epoch 013: 65 / 1689 loss=4.324, nll_loss=2.718, ppl=6.58, wps=456818, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.257, clip=0, loss_scale=4, train_wall=92, gb_free=18.5, wall=19767 epoch 013: 65 / 1689 loss=4.324, nll_loss=2.718, ppl=6.58, wps=456818, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.257, clip=0, loss_scale=4, train_wall=92, gb_free=18.5, wall=19767 epoch 013: 65 / 1689 loss=4.324, nll_loss=2.718, ppl=6.58, wps=456818, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.257, clip=0, loss_scale=4, train_wall=92, gb_free=18.5, wall=19767 epoch 013: 65 / 1689 loss=4.324, nll_loss=2.718, ppl=6.58, wps=456818, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.257, clip=0, loss_scale=4, train_wall=92, gb_free=18.5, wall=19767 epoch 013: 65 / 1689 loss=4.324, nll_loss=2.718, ppl=6.58, wps=456818, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.257, clip=0, loss_scale=4, train_wall=92, gb_free=18.5, wall=19767 epoch 013: 65 / 1689 loss=4.324, nll_loss=2.718, ppl=6.58, wps=456818, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.257, clip=0, loss_scale=4, train_wall=92, gb_free=18.5, wall=19767 epoch 013: 65 / 1689 loss=4.324, nll_loss=2.718, ppl=6.58, wps=456818, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.257, clip=0, loss_scale=4, train_wall=92, gb_free=18.5, wall=19767 epoch 013: 65 / 1689 loss=4.324, nll_loss=2.718, ppl=6.58, wps=456818, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.257, clip=0, loss_scale=4, train_wall=92, gb_free=18.5, wall=19767 epoch 013: 65 / 1689 loss=4.324, nll_loss=2.718, ppl=6.58, wps=456818, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.257, clip=0, loss_scale=4, train_wall=92, gb_free=18.5, wall=19767 epoch 013: 65 / 1689 loss=4.324, nll_loss=2.718, ppl=6.58, wps=456818, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.257, clip=0, loss_scale=4, train_wall=92, gb_free=18.5, wall=19767 epoch 013: 166 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=455495, ups=1.05, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=19862 epoch 013: 166 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=455495, ups=1.05, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=19862 epoch 013: 166 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=455495, ups=1.05, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=19862 epoch 013: 166 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=455495, ups=1.05, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=19862 epoch 013: 166 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=455495, ups=1.05, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=19862 epoch 013: 166 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=455495, ups=1.05, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=19862 epoch 013: 166 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=455495, ups=1.05, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=19862 epoch 013: 166 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=455495, ups=1.05, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=19862 epoch 013: 166 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=455495, ups=1.05, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=19862 epoch 013: 166 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=455495, ups=1.05, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=19862 epoch 013: 166 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=455495, ups=1.05, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=19862 epoch 013: 166 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=455495, ups=1.05, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=19862 epoch 013: 166 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=455495, ups=1.05, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=19862 epoch 013: 267 / 1689 loss=4.326, nll_loss=2.721, ppl=6.59, wps=453182, ups=1.04, wpb=435071, bsz=16323.8, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=19958 epoch 013: 267 / 1689 loss=4.326, nll_loss=2.721, ppl=6.59, wps=453182, ups=1.04, wpb=435071, bsz=16323.8, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=19958 epoch 013: 267 / 1689 loss=4.326, nll_loss=2.721, ppl=6.59, wps=453182, ups=1.04, wpb=435071, bsz=16323.8, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=19958 epoch 013: 267 / 1689 loss=4.326, nll_loss=2.721, ppl=6.59, wps=453182, ups=1.04, wpb=435071, bsz=16323.8, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=19958 epoch 013: 267 / 1689 loss=4.326, nll_loss=2.721, ppl=6.59, wps=453182, ups=1.04, wpb=435071, bsz=16323.8, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=19958 epoch 013: 267 / 1689 loss=4.326, nll_loss=2.721, ppl=6.59, wps=453182, ups=1.04, wpb=435071, bsz=16323.8, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=19958 epoch 013: 267 / 1689 loss=4.326, nll_loss=2.721, ppl=6.59, wps=453182, ups=1.04, wpb=435071, bsz=16323.8, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=19958 epoch 013: 267 / 1689 loss=4.326, nll_loss=2.721, ppl=6.59, wps=453182, ups=1.04, wpb=435071, bsz=16323.8, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=19958 epoch 013: 267 / 1689 loss=4.326, nll_loss=2.721, ppl=6.59, wps=453182, ups=1.04, wpb=435071, bsz=16323.8, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=19958 epoch 013: 267 / 1689 loss=4.326, nll_loss=2.721, ppl=6.59, wps=453182, ups=1.04, wpb=435071, bsz=16323.8, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=19958 epoch 013: 267 / 1689 loss=4.326, nll_loss=2.721, ppl=6.59, wps=453182, ups=1.04, wpb=435071, bsz=16323.8, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=19958 epoch 013: 267 / 1689 loss=4.326, nll_loss=2.721, ppl=6.59, wps=453182, ups=1.04, wpb=435071, bsz=16323.8, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=19958 epoch 013: 267 / 1689 loss=4.326, nll_loss=2.721, ppl=6.59, wps=453182, ups=1.04, wpb=435071, bsz=16323.8, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=19958 epoch 013: 367 / 1689 loss=4.318, nll_loss=2.711, ppl=6.55, wps=455553, ups=1.05, wpb=433575, bsz=16748, num_updates=20600, lr=0.000440653, gnorm=0.249, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=20053 epoch 013: 367 / 1689 loss=4.318, nll_loss=2.711, ppl=6.55, wps=455553, ups=1.05, wpb=433575, bsz=16748, num_updates=20600, lr=0.000440653, gnorm=0.249, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=20053 epoch 013: 367 / 1689 loss=4.318, nll_loss=2.711, ppl=6.55, wps=455553, ups=1.05, wpb=433575, bsz=16748, num_updates=20600, lr=0.000440653, gnorm=0.249, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=20053 epoch 013: 367 / 1689 loss=4.318, nll_loss=2.711, ppl=6.55, wps=455553, ups=1.05, wpb=433575, bsz=16748, num_updates=20600, lr=0.000440653, gnorm=0.249, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=20053 epoch 013: 367 / 1689 loss=4.318, nll_loss=2.711, ppl=6.55, wps=455553, ups=1.05, wpb=433575, bsz=16748, num_updates=20600, lr=0.000440653, gnorm=0.249, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=20053 epoch 013: 367 / 1689 loss=4.318, nll_loss=2.711, ppl=6.55, wps=455553, ups=1.05, wpb=433575, bsz=16748, num_updates=20600, lr=0.000440653, gnorm=0.249, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=20053 epoch 013: 367 / 1689 loss=4.318, nll_loss=2.711, ppl=6.55, wps=455553, ups=1.05, wpb=433575, bsz=16748, num_updates=20600, lr=0.000440653, gnorm=0.249, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=20053 epoch 013: 367 / 1689 loss=4.318, nll_loss=2.711, ppl=6.55, wps=455553, ups=1.05, wpb=433575, bsz=16748, num_updates=20600, lr=0.000440653, gnorm=0.249, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=20053 epoch 013: 367 / 1689 loss=4.318, nll_loss=2.711, ppl=6.55, wps=455553, ups=1.05, wpb=433575, bsz=16748, num_updates=20600, lr=0.000440653, gnorm=0.249, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=20053 epoch 013: 367 / 1689 loss=4.318, nll_loss=2.711, ppl=6.55, wps=455553, ups=1.05, wpb=433575, bsz=16748, num_updates=20600, lr=0.000440653, gnorm=0.249, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=20053 epoch 013: 367 / 1689 loss=4.318, nll_loss=2.711, ppl=6.55, wps=455553, ups=1.05, wpb=433575, bsz=16748, num_updates=20600, lr=0.000440653, gnorm=0.249, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=20053 epoch 013: 367 / 1689 loss=4.318, nll_loss=2.711, ppl=6.55, wps=455553, ups=1.05, wpb=433575, bsz=16748, num_updates=20600, lr=0.000440653, gnorm=0.249, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=20053 epoch 013: 367 / 1689 loss=4.318, nll_loss=2.711, ppl=6.55, wps=455553, ups=1.05, wpb=433575, bsz=16748, num_updates=20600, lr=0.000440653, gnorm=0.249, clip=0, loss_scale=1, train_wall=94, gb_free=18.2, wall=20053 epoch 013: 467 / 1689 loss=4.316, nll_loss=2.71, ppl=6.54, wps=461403, ups=1.07, wpb=432150, bsz=16644, num_updates=20700, lr=0.000439587, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=20147 epoch 013: 467 / 1689 loss=4.316, nll_loss=2.71, ppl=6.54, wps=461403, ups=1.07, wpb=432150, bsz=16644, num_updates=20700, lr=0.000439587, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=20147 epoch 013: 467 / 1689 loss=4.316, nll_loss=2.71, ppl=6.54, wps=461403, ups=1.07, wpb=432150, bsz=16644, num_updates=20700, lr=0.000439587, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=20147 epoch 013: 467 / 1689 loss=4.316, nll_loss=2.71, ppl=6.54, wps=461403, ups=1.07, wpb=432150, bsz=16644, num_updates=20700, lr=0.000439587, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=20147 epoch 013: 467 / 1689 loss=4.316, nll_loss=2.71, ppl=6.54, wps=461403, ups=1.07, wpb=432150, bsz=16644, num_updates=20700, lr=0.000439587, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=20147 epoch 013: 467 / 1689 loss=4.316, nll_loss=2.71, ppl=6.54, wps=461403, ups=1.07, wpb=432150, bsz=16644, num_updates=20700, lr=0.000439587, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=20147 epoch 013: 467 / 1689 loss=4.316, nll_loss=2.71, ppl=6.54, wps=461403, ups=1.07, wpb=432150, bsz=16644, num_updates=20700, lr=0.000439587, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=20147 epoch 013: 467 / 1689 loss=4.316, nll_loss=2.71, ppl=6.54, wps=461403, ups=1.07, wpb=432150, bsz=16644, num_updates=20700, lr=0.000439587, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=20147 epoch 013: 467 / 1689 loss=4.316, nll_loss=2.71, ppl=6.54, wps=461403, ups=1.07, wpb=432150, bsz=16644, num_updates=20700, lr=0.000439587, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=20147 epoch 013: 467 / 1689 loss=4.316, nll_loss=2.71, ppl=6.54, wps=461403, ups=1.07, wpb=432150, bsz=16644, num_updates=20700, lr=0.000439587, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=20147 epoch 013: 467 / 1689 loss=4.316, nll_loss=2.71, ppl=6.54, wps=461403, ups=1.07, wpb=432150, bsz=16644, num_updates=20700, lr=0.000439587, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=20147 epoch 013: 467 / 1689 loss=4.316, nll_loss=2.71, ppl=6.54, wps=461403, ups=1.07, wpb=432150, bsz=16644, num_updates=20700, lr=0.000439587, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=20147 epoch 013: 467 / 1689 loss=4.316, nll_loss=2.71, ppl=6.54, wps=461403, ups=1.07, wpb=432150, bsz=16644, num_updates=20700, lr=0.000439587, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=20147 epoch 013: 567 / 1689 loss=4.334, nll_loss=2.73, ppl=6.63, wps=458115, ups=1.06, wpb=433669, bsz=16082.9, num_updates=20800, lr=0.000438529, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=20241 epoch 013: 567 / 1689 loss=4.334, nll_loss=2.73, ppl=6.63, wps=458115, ups=1.06, wpb=433669, bsz=16082.9, num_updates=20800, lr=0.000438529, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=20241 epoch 013: 567 / 1689 loss=4.334, nll_loss=2.73, ppl=6.63, wps=458115, ups=1.06, wpb=433669, bsz=16082.9, num_updates=20800, lr=0.000438529, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=20241 epoch 013: 567 / 1689 loss=4.334, nll_loss=2.73, ppl=6.63, wps=458115, ups=1.06, wpb=433669, bsz=16082.9, num_updates=20800, lr=0.000438529, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=20241 epoch 013: 567 / 1689 loss=4.334, nll_loss=2.73, ppl=6.63, wps=458115, ups=1.06, wpb=433669, bsz=16082.9, num_updates=20800, lr=0.000438529, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=20241 epoch 013: 567 / 1689 loss=4.334, nll_loss=2.73, ppl=6.63, wps=458115, ups=1.06, wpb=433669, bsz=16082.9, num_updates=20800, lr=0.000438529, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=20241 epoch 013: 567 / 1689 loss=4.334, nll_loss=2.73, ppl=6.63, wps=458115, ups=1.06, wpb=433669, bsz=16082.9, num_updates=20800, lr=0.000438529, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=20241 epoch 013: 567 / 1689 loss=4.334, nll_loss=2.73, ppl=6.63, wps=458115, ups=1.06, wpb=433669, bsz=16082.9, num_updates=20800, lr=0.000438529, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=20241 epoch 013: 567 / 1689 loss=4.334, nll_loss=2.73, ppl=6.63, wps=458115, ups=1.06, wpb=433669, bsz=16082.9, num_updates=20800, lr=0.000438529, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=20241 epoch 013: 567 / 1689 loss=4.334, nll_loss=2.73, ppl=6.63, wps=458115, ups=1.06, wpb=433669, bsz=16082.9, num_updates=20800, lr=0.000438529, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=20241 epoch 013: 567 / 1689 loss=4.334, nll_loss=2.73, ppl=6.63, wps=458115, ups=1.06, wpb=433669, bsz=16082.9, num_updates=20800, lr=0.000438529, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=20241 epoch 013: 567 / 1689 loss=4.334, nll_loss=2.73, ppl=6.63, wps=458115, ups=1.06, wpb=433669, bsz=16082.9, num_updates=20800, lr=0.000438529, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=20241 epoch 013: 567 / 1689 loss=4.334, nll_loss=2.73, ppl=6.63, wps=458115, ups=1.06, wpb=433669, bsz=16082.9, num_updates=20800, lr=0.000438529, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=20241 epoch 013: 667 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=459814, ups=1.06, wpb=435178, bsz=17088.6, num_updates=20900, lr=0.000437479, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20336 epoch 013: 667 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=459814, ups=1.06, wpb=435178, bsz=17088.6, num_updates=20900, lr=0.000437479, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20336 epoch 013: 667 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=459814, ups=1.06, wpb=435178, bsz=17088.6, num_updates=20900, lr=0.000437479, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20336 epoch 013: 667 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=459814, ups=1.06, wpb=435178, bsz=17088.6, num_updates=20900, lr=0.000437479, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20336 epoch 013: 667 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=459814, ups=1.06, wpb=435178, bsz=17088.6, num_updates=20900, lr=0.000437479, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20336 epoch 013: 667 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=459814, ups=1.06, wpb=435178, bsz=17088.6, num_updates=20900, lr=0.000437479, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20336 epoch 013: 667 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=459814, ups=1.06, wpb=435178, bsz=17088.6, num_updates=20900, lr=0.000437479, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20336 epoch 013: 667 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=459814, ups=1.06, wpb=435178, bsz=17088.6, num_updates=20900, lr=0.000437479, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20336 epoch 013: 667 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=459814, ups=1.06, wpb=435178, bsz=17088.6, num_updates=20900, lr=0.000437479, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20336 epoch 013: 667 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=459814, ups=1.06, wpb=435178, bsz=17088.6, num_updates=20900, lr=0.000437479, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20336 epoch 013: 667 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=459814, ups=1.06, wpb=435178, bsz=17088.6, num_updates=20900, lr=0.000437479, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20336 epoch 013: 667 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=459814, ups=1.06, wpb=435178, bsz=17088.6, num_updates=20900, lr=0.000437479, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20336 epoch 013: 667 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=459814, ups=1.06, wpb=435178, bsz=17088.6, num_updates=20900, lr=0.000437479, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20336 epoch 013: 767 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=459249, ups=1.06, wpb=432766, bsz=16488.7, num_updates=21000, lr=0.000436436, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=20430 epoch 013: 767 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=459249, ups=1.06, wpb=432766, bsz=16488.7, num_updates=21000, lr=0.000436436, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=20430 epoch 013: 767 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=459249, ups=1.06, wpb=432766, bsz=16488.7, num_updates=21000, lr=0.000436436, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=20430 epoch 013: 767 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=459249, ups=1.06, wpb=432766, bsz=16488.7, num_updates=21000, lr=0.000436436, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=20430 epoch 013: 767 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=459249, ups=1.06, wpb=432766, bsz=16488.7, num_updates=21000, lr=0.000436436, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=20430 epoch 013: 767 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=459249, ups=1.06, wpb=432766, bsz=16488.7, num_updates=21000, lr=0.000436436, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=20430 epoch 013: 767 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=459249, ups=1.06, wpb=432766, bsz=16488.7, num_updates=21000, lr=0.000436436, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=20430 epoch 013: 767 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=459249, ups=1.06, wpb=432766, bsz=16488.7, num_updates=21000, lr=0.000436436, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=20430 epoch 013: 767 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=459249, ups=1.06, wpb=432766, bsz=16488.7, num_updates=21000, lr=0.000436436, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=20430 epoch 013: 767 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=459249, ups=1.06, wpb=432766, bsz=16488.7, num_updates=21000, lr=0.000436436, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=20430 epoch 013: 767 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=459249, ups=1.06, wpb=432766, bsz=16488.7, num_updates=21000, lr=0.000436436, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=20430 epoch 013: 767 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=459249, ups=1.06, wpb=432766, bsz=16488.7, num_updates=21000, lr=0.000436436, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=20430 epoch 013: 767 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=459249, ups=1.06, wpb=432766, bsz=16488.7, num_updates=21000, lr=0.000436436, gnorm=0.251, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=20430 begin validation on "valid" subset epoch 013 | valid on 'valid' subset | loss 4.306 | nll_loss 2.656 | ppl 6.3 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.306 epoch 013 | valid on 'valid' subset | loss 4.306 | nll_loss 2.656 | ppl 6.3 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.306 epoch 013 | valid on 'valid' subset | loss 4.306 | nll_loss 2.656 | ppl 6.3 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.306 epoch 013 | valid on 'valid' subset | loss 4.306 | nll_loss 2.656 | ppl 6.3 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.306 epoch 013 | valid on 'valid' subset | loss 4.306 | nll_loss 2.656 | ppl 6.3 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.306 epoch 013 | valid on 'valid' subset | loss 4.306 | nll_loss 2.656 | ppl 6.3 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.306 epoch 013 | valid on 'valid' subset | loss 4.306 | nll_loss 2.656 | ppl 6.3 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.306 epoch 013 | valid on 'valid' subset | loss 4.306 | nll_loss 2.656 | ppl 6.3 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.306 epoch 013 | valid on 'valid' subset | loss 4.306 | nll_loss 2.656 | ppl 6.3 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.306 epoch 013 | valid on 'valid' subset | loss 4.306 | nll_loss 2.656 | ppl 6.3 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.306 epoch 013 | valid on 'valid' subset | loss 4.306 | nll_loss 2.656 | ppl 6.3 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.306 epoch 013 | valid on 'valid' subset | loss 4.306 | nll_loss 2.656 | ppl 6.3 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.306 epoch 013 | valid on 'valid' subset | loss 4.306 | nll_loss 2.656 | ppl 6.3 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.306 epoch 013: 867 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=377072, ups=0.87, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20546 epoch 013: 867 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=377072, ups=0.87, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20546 epoch 013: 867 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=377072, ups=0.87, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20546 epoch 013: 867 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=377072, ups=0.87, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20546 epoch 013: 867 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=377072, ups=0.87, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20546 epoch 013: 867 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=377072, ups=0.87, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20546 epoch 013: 867 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=377072, ups=0.87, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20546 epoch 013: 867 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=377072, ups=0.87, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20546 epoch 013: 867 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=377072, ups=0.87, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20546 epoch 013: 867 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=377072, ups=0.87, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20546 epoch 013: 867 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=377072, ups=0.87, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20546 epoch 013: 867 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=377072, ups=0.87, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20546 epoch 013: 867 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=377072, ups=0.87, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20546 epoch 013: 967 / 1689 loss=4.332, nll_loss=2.728, ppl=6.62, wps=458550, ups=1.06, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20640 epoch 013: 967 / 1689 loss=4.332, nll_loss=2.728, ppl=6.62, wps=458550, ups=1.06, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20640 epoch 013: 967 / 1689 loss=4.332, nll_loss=2.728, ppl=6.62, wps=458550, ups=1.06, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20640 epoch 013: 967 / 1689 loss=4.332, nll_loss=2.728, ppl=6.62, wps=458550, ups=1.06, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20640 epoch 013: 967 / 1689 loss=4.332, nll_loss=2.728, ppl=6.62, wps=458550, ups=1.06, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20640 epoch 013: 967 / 1689 loss=4.332, nll_loss=2.728, ppl=6.62, wps=458550, ups=1.06, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20640 epoch 013: 967 / 1689 loss=4.332, nll_loss=2.728, ppl=6.62, wps=458550, ups=1.06, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20640 epoch 013: 967 / 1689 loss=4.332, nll_loss=2.728, ppl=6.62, wps=458550, ups=1.06, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20640 epoch 013: 967 / 1689 loss=4.332, nll_loss=2.728, ppl=6.62, wps=458550, ups=1.06, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20640 epoch 013: 967 / 1689 loss=4.332, nll_loss=2.728, ppl=6.62, wps=458550, ups=1.06, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20640 epoch 013: 967 / 1689 loss=4.332, nll_loss=2.728, ppl=6.62, wps=458550, ups=1.06, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20640 epoch 013: 967 / 1689 loss=4.332, nll_loss=2.728, ppl=6.62, wps=458550, ups=1.06, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20640 epoch 013: 967 / 1689 loss=4.332, nll_loss=2.728, ppl=6.62, wps=458550, ups=1.06, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=20640 epoch 013: 1067 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=462398, ups=1.06, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=20734 epoch 013: 1067 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=462398, ups=1.06, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=20734 epoch 013: 1067 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=462398, ups=1.06, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=20734 epoch 013: 1067 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=462398, ups=1.06, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=20734 epoch 013: 1067 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=462398, ups=1.06, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=20734 epoch 013: 1067 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=462398, ups=1.06, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=20734 epoch 013: 1067 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=462398, ups=1.06, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=20734 epoch 013: 1067 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=462398, ups=1.06, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=20734 epoch 013: 1067 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=462398, ups=1.06, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=20734 epoch 013: 1067 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=462398, ups=1.06, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=20734 epoch 013: 1067 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=462398, ups=1.06, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=20734 epoch 013: 1067 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=462398, ups=1.06, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=20734 epoch 013: 1067 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=462398, ups=1.06, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=20734 epoch 013: 1167 / 1689 loss=4.326, nll_loss=2.722, ppl=6.6, wps=462592, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.259, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=20828 epoch 013: 1167 / 1689 loss=4.326, nll_loss=2.722, ppl=6.6, wps=462592, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.259, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=20828 epoch 013: 1167 / 1689 loss=4.326, nll_loss=2.722, ppl=6.6, wps=462592, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.259, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=20828 epoch 013: 1167 / 1689 loss=4.326, nll_loss=2.722, ppl=6.6, wps=462592, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.259, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=20828 epoch 013: 1167 / 1689 loss=4.326, nll_loss=2.722, ppl=6.6, wps=462592, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.259, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=20828 epoch 013: 1167 / 1689 loss=4.326, nll_loss=2.722, ppl=6.6, wps=462592, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.259, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=20828 epoch 013: 1167 / 1689 loss=4.326, nll_loss=2.722, ppl=6.6, wps=462592, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.259, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=20828 epoch 013: 1167 / 1689 loss=4.326, nll_loss=2.722, ppl=6.6, wps=462592, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.259, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=20828 epoch 013: 1167 / 1689 loss=4.326, nll_loss=2.722, ppl=6.6, wps=462592, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.259, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=20828 epoch 013: 1167 / 1689 loss=4.326, nll_loss=2.722, ppl=6.6, wps=462592, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.259, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=20828 epoch 013: 1167 / 1689 loss=4.326, nll_loss=2.722, ppl=6.6, wps=462592, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.259, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=20828 epoch 013: 1167 / 1689 loss=4.326, nll_loss=2.722, ppl=6.6, wps=462592, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.259, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=20828 epoch 013: 1167 / 1689 loss=4.326, nll_loss=2.722, ppl=6.6, wps=462592, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.259, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=20828 epoch 013: 1267 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=463496, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.265, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=20922 epoch 013: 1267 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=463496, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.265, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=20922 epoch 013: 1267 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=463496, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.265, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=20922 epoch 013: 1267 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=463496, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.265, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=20922 epoch 013: 1267 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=463496, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.265, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=20922 epoch 013: 1267 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=463496, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.265, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=20922 epoch 013: 1267 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=463496, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.265, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=20922 epoch 013: 1267 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=463496, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.265, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=20922 epoch 013: 1267 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=463496, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.265, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=20922 epoch 013: 1267 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=463496, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.265, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=20922 epoch 013: 1267 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=463496, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.265, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=20922 epoch 013: 1267 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=463496, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.265, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=20922 epoch 013: 1267 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=463496, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.265, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=20922 epoch 013: 1368 / 1689 loss=4.342, nll_loss=2.74, ppl=6.68, wps=452668, ups=1.04, wpb=433377, bsz=16785.1, num_updates=21600, lr=0.000430331, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21017 epoch 013: 1368 / 1689 loss=4.342, nll_loss=2.74, ppl=6.68, wps=452668, ups=1.04, wpb=433377, bsz=16785.1, num_updates=21600, lr=0.000430331, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21017 epoch 013: 1368 / 1689 loss=4.342, nll_loss=2.74, ppl=6.68, wps=452668, ups=1.04, wpb=433377, bsz=16785.1, num_updates=21600, lr=0.000430331, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21017 epoch 013: 1368 / 1689 loss=4.342, nll_loss=2.74, ppl=6.68, wps=452668, ups=1.04, wpb=433377, bsz=16785.1, num_updates=21600, lr=0.000430331, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21017 epoch 013: 1368 / 1689 loss=4.342, nll_loss=2.74, ppl=6.68, wps=452668, ups=1.04, wpb=433377, bsz=16785.1, num_updates=21600, lr=0.000430331, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21017 epoch 013: 1368 / 1689 loss=4.342, nll_loss=2.74, ppl=6.68, wps=452668, ups=1.04, wpb=433377, bsz=16785.1, num_updates=21600, lr=0.000430331, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21017 epoch 013: 1368 / 1689 loss=4.342, nll_loss=2.74, ppl=6.68, wps=452668, ups=1.04, wpb=433377, bsz=16785.1, num_updates=21600, lr=0.000430331, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21017 epoch 013: 1368 / 1689 loss=4.342, nll_loss=2.74, ppl=6.68, wps=452668, ups=1.04, wpb=433377, bsz=16785.1, num_updates=21600, lr=0.000430331, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21017 epoch 013: 1368 / 1689 loss=4.342, nll_loss=2.74, ppl=6.68, wps=452668, ups=1.04, wpb=433377, bsz=16785.1, num_updates=21600, lr=0.000430331, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21017 epoch 013: 1368 / 1689 loss=4.342, nll_loss=2.74, ppl=6.68, wps=452668, ups=1.04, wpb=433377, bsz=16785.1, num_updates=21600, lr=0.000430331, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21017 epoch 013: 1368 / 1689 loss=4.342, nll_loss=2.74, ppl=6.68, wps=452668, ups=1.04, wpb=433377, bsz=16785.1, num_updates=21600, lr=0.000430331, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21017 epoch 013: 1368 / 1689 loss=4.342, nll_loss=2.74, ppl=6.68, wps=452668, ups=1.04, wpb=433377, bsz=16785.1, num_updates=21600, lr=0.000430331, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21017 epoch 013: 1368 / 1689 loss=4.342, nll_loss=2.74, ppl=6.68, wps=452668, ups=1.04, wpb=433377, bsz=16785.1, num_updates=21600, lr=0.000430331, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21017 epoch 013: 1468 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458960, ups=1.06, wpb=433332, bsz=16598.6, num_updates=21700, lr=0.000429339, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=21112 epoch 013: 1468 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458960, ups=1.06, wpb=433332, bsz=16598.6, num_updates=21700, lr=0.000429339, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=21112 epoch 013: 1468 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458960, ups=1.06, wpb=433332, bsz=16598.6, num_updates=21700, lr=0.000429339, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=21112 epoch 013: 1468 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458960, ups=1.06, wpb=433332, bsz=16598.6, num_updates=21700, lr=0.000429339, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=21112 epoch 013: 1468 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458960, ups=1.06, wpb=433332, bsz=16598.6, num_updates=21700, lr=0.000429339, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=21112 epoch 013: 1468 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458960, ups=1.06, wpb=433332, bsz=16598.6, num_updates=21700, lr=0.000429339, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=21112 epoch 013: 1468 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458960, ups=1.06, wpb=433332, bsz=16598.6, num_updates=21700, lr=0.000429339, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=21112 epoch 013: 1468 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458960, ups=1.06, wpb=433332, bsz=16598.6, num_updates=21700, lr=0.000429339, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=21112 epoch 013: 1468 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458960, ups=1.06, wpb=433332, bsz=16598.6, num_updates=21700, lr=0.000429339, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=21112 epoch 013: 1468 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458960, ups=1.06, wpb=433332, bsz=16598.6, num_updates=21700, lr=0.000429339, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=21112 epoch 013: 1468 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458960, ups=1.06, wpb=433332, bsz=16598.6, num_updates=21700, lr=0.000429339, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=21112 epoch 013: 1468 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458960, ups=1.06, wpb=433332, bsz=16598.6, num_updates=21700, lr=0.000429339, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=21112 epoch 013: 1468 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458960, ups=1.06, wpb=433332, bsz=16598.6, num_updates=21700, lr=0.000429339, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=21112 epoch 013: 1568 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458223, ups=1.06, wpb=432768, bsz=16601.2, num_updates=21800, lr=0.000428353, gnorm=0.259, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=21206 epoch 013: 1568 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458223, ups=1.06, wpb=432768, bsz=16601.2, num_updates=21800, lr=0.000428353, gnorm=0.259, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=21206 epoch 013: 1568 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458223, ups=1.06, wpb=432768, bsz=16601.2, num_updates=21800, lr=0.000428353, gnorm=0.259, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=21206 epoch 013: 1568 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458223, ups=1.06, wpb=432768, bsz=16601.2, num_updates=21800, lr=0.000428353, gnorm=0.259, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=21206 epoch 013: 1568 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458223, ups=1.06, wpb=432768, bsz=16601.2, num_updates=21800, lr=0.000428353, gnorm=0.259, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=21206 epoch 013: 1568 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458223, ups=1.06, wpb=432768, bsz=16601.2, num_updates=21800, lr=0.000428353, gnorm=0.259, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=21206 epoch 013: 1568 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458223, ups=1.06, wpb=432768, bsz=16601.2, num_updates=21800, lr=0.000428353, gnorm=0.259, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=21206 epoch 013: 1568 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458223, ups=1.06, wpb=432768, bsz=16601.2, num_updates=21800, lr=0.000428353, gnorm=0.259, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=21206 epoch 013: 1568 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458223, ups=1.06, wpb=432768, bsz=16601.2, num_updates=21800, lr=0.000428353, gnorm=0.259, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=21206 epoch 013: 1568 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458223, ups=1.06, wpb=432768, bsz=16601.2, num_updates=21800, lr=0.000428353, gnorm=0.259, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=21206 epoch 013: 1568 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458223, ups=1.06, wpb=432768, bsz=16601.2, num_updates=21800, lr=0.000428353, gnorm=0.259, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=21206 epoch 013: 1568 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458223, ups=1.06, wpb=432768, bsz=16601.2, num_updates=21800, lr=0.000428353, gnorm=0.259, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=21206 epoch 013: 1568 / 1689 loss=4.334, nll_loss=2.731, ppl=6.64, wps=458223, ups=1.06, wpb=432768, bsz=16601.2, num_updates=21800, lr=0.000428353, gnorm=0.259, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=21206 epoch 013: 1668 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=456062, ups=1.05, wpb=433066, bsz=16403.4, num_updates=21900, lr=0.000427374, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=21301 epoch 013: 1668 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=456062, ups=1.05, wpb=433066, bsz=16403.4, num_updates=21900, lr=0.000427374, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=21301 epoch 013: 1668 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=456062, ups=1.05, wpb=433066, bsz=16403.4, num_updates=21900, lr=0.000427374, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=21301 epoch 013: 1668 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=456062, ups=1.05, wpb=433066, bsz=16403.4, num_updates=21900, lr=0.000427374, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=21301 epoch 013: 1668 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=456062, ups=1.05, wpb=433066, bsz=16403.4, num_updates=21900, lr=0.000427374, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=21301 epoch 013: 1668 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=456062, ups=1.05, wpb=433066, bsz=16403.4, num_updates=21900, lr=0.000427374, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=21301 epoch 013: 1668 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=456062, ups=1.05, wpb=433066, bsz=16403.4, num_updates=21900, lr=0.000427374, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=21301 epoch 013: 1668 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=456062, ups=1.05, wpb=433066, bsz=16403.4, num_updates=21900, lr=0.000427374, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=21301 epoch 013: 1668 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=456062, ups=1.05, wpb=433066, bsz=16403.4, num_updates=21900, lr=0.000427374, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=21301 epoch 013: 1668 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=456062, ups=1.05, wpb=433066, bsz=16403.4, num_updates=21900, lr=0.000427374, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=21301 epoch 013: 1668 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=456062, ups=1.05, wpb=433066, bsz=16403.4, num_updates=21900, lr=0.000427374, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=21301 epoch 013: 1668 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=456062, ups=1.05, wpb=433066, bsz=16403.4, num_updates=21900, lr=0.000427374, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=21301 epoch 013: 1668 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=456062, ups=1.05, wpb=433066, bsz=16403.4, num_updates=21900, lr=0.000427374, gnorm=0.265, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=21301 end of epoch 13 (average epoch stats below) epoch 013 | loss 4.33 | nll_loss 2.726 | ppl 6.61 | wps 452450 | ups 1.04 | wpb 433534 | bsz 16502.3 | num_updates 21921 | lr 0.000427169 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 19.7 | wall 21320 epoch 013 | loss 4.33 | nll_loss 2.726 | ppl 6.61 | wps 452450 | ups 1.04 | wpb 433534 | bsz 16502.3 | num_updates 21921 | lr 0.000427169 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 19.7 | wall 21320 epoch 013 | loss 4.33 | nll_loss 2.726 | ppl 6.61 | wps 452450 | ups 1.04 | wpb 433534 | bsz 16502.3 | num_updates 21921 | lr 0.000427169 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 19.7 | wall 21320 epoch 013 | loss 4.33 | nll_loss 2.726 | ppl 6.61 | wps 452450 | ups 1.04 | wpb 433534 | bsz 16502.3 | num_updates 21921 | lr 0.000427169 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 19.7 | wall 21320 epoch 013 | loss 4.33 | nll_loss 2.726 | ppl 6.61 | wps 452450 | ups 1.04 | wpb 433534 | bsz 16502.3 | num_updates 21921 | lr 0.000427169 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 19.7 | wall 21320 epoch 013 | loss 4.33 | nll_loss 2.726 | ppl 6.61 | wps 452450 | ups 1.04 | wpb 433534 | bsz 16502.3 | num_updates 21921 | lr 0.000427169 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 19.7 | wall 21320 epoch 013 | loss 4.33 | nll_loss 2.726 | ppl 6.61 | wps 452450 | ups 1.04 | wpb 433534 | bsz 16502.3 | num_updates 21921 | lr 0.000427169 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 19.7 | wall 21320 epoch 013 | loss 4.33 | nll_loss 2.726 | ppl 6.61 | wps 452450 | ups 1.04 | wpb 433534 | bsz 16502.3 | num_updates 21921 | lr 0.000427169 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 19.7 | wall 21320 epoch 013 | loss 4.33 | nll_loss 2.726 | ppl 6.61 | wps 452450 | ups 1.04 | wpb 433534 | bsz 16502.3 | num_updates 21921 | lr 0.000427169 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 19.7 | wall 21320 epoch 013 | loss 4.33 | nll_loss 2.726 | ppl 6.61 | wps 452450 | ups 1.04 | wpb 433534 | bsz 16502.3 | num_updates 21921 | lr 0.000427169 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 19.7 | wall 21320 epoch 013 | loss 4.33 | nll_loss 2.726 | ppl 6.61 | wps 452450 | ups 1.04 | wpb 433534 | bsz 16502.3 | num_updates 21921 | lr 0.000427169 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 19.7 | wall 21320 epoch 013 | loss 4.33 | nll_loss 2.726 | ppl 6.61 | wps 452450 | ups 1.04 | wpb 433534 | bsz 16502.3 | num_updates 21921 | lr 0.000427169 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 19.7 | wall 21320 epoch 013 | loss 4.33 | nll_loss 2.726 | ppl 6.61 | wps 452450 | ups 1.04 | wpb 433534 | bsz 16502.3 | num_updates 21921 | lr 0.000427169 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 19.7 | wall 21320 Start iterating over samples epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 epoch 014: 79 / 1689 loss=4.309, nll_loss=2.701, ppl=6.5, wps=444663, ups=1.04, wpb=428841, bsz=16236.9, num_updates=22000, lr=0.000426401, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=21398 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.304 | nll_loss 2.657 | ppl 6.31 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.304 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 180 / 1689 loss=4.317, nll_loss=2.711, ppl=6.55, wps=376758, ups=0.87, wpb=434727, bsz=16483.1, num_updates=22100, lr=0.000425436, gnorm=0.249, clip=0, loss_scale=2, train_wall=96, gb_free=17.9, wall=21513 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 280 / 1689 loss=4.307, nll_loss=2.699, ppl=6.49, wps=464095, ups=1.07, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=21606 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 380 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460823, ups=1.06, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=21700 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 480 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=462142, ups=1.06, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.255, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=21795 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 580 / 1689 loss=4.311, nll_loss=2.704, ppl=6.52, wps=461928, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=17.7, wall=21889 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 681 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=456303, ups=1.05, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=18.7, wall=21984 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 781 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=459691, ups=1.05, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.256, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=22078 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 881 / 1689 loss=4.32, nll_loss=2.715, ppl=6.57, wps=457518, ups=1.05, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.244, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=22173 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 981 / 1689 loss=4.307, nll_loss=2.701, ppl=6.5, wps=453334, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=22268 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 epoch 014: 1081 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=457135, ups=1.05, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.237, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=22363 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014 | valid on 'valid' subset | loss 4.311 | nll_loss 2.665 | ppl 6.34 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.304 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1181 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=327789, ups=0.76, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.239, clip=0, loss_scale=4, train_wall=91, gb_free=21.4, wall=22495 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1281 / 1689 loss=4.318, nll_loss=2.713, ppl=6.56, wps=471182, ups=1.08, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.236, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=22587 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1382 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=463129, ups=1.07, wpb=434858, bsz=16258.9, num_updates=23300, lr=0.000414335, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22681 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1482 / 1689 loss=4.324, nll_loss=2.72, ppl=6.59, wps=466821, ups=1.07, wpb=435067, bsz=16606.6, num_updates=23400, lr=0.000413449, gnorm=0.249, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=22774 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1582 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=463469, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.239, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=22868 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 epoch 014: 1682 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460512, ups=1.07, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=22962 end of epoch 14 (average epoch stats below) epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 epoch 014 | loss 4.316 | nll_loss 2.711 | ppl 6.55 | wps 443709 | ups 1.02 | wpb 433535 | bsz 16503.1 | num_updates 23607 | lr 0.000411632 | gnorm 0.244 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 21.6 | wall 22967 Start iterating over samples epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 93 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=451052, ups=1.05, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.1, wall=23057 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 193 / 1689 loss=4.29, nll_loss=2.681, ppl=6.41, wps=459331, ups=1.06, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=23152 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 294 / 1689 loss=4.3, nll_loss=2.692, ppl=6.46, wps=454273, ups=1.05, wpb=434397, bsz=16515, num_updates=23900, lr=0.000409101, gnorm=0.245, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23247 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 epoch 015: 394 / 1689 loss=4.311, nll_loss=2.705, ppl=6.52, wps=460146, ups=1.06, wpb=434360, bsz=16809.8, num_updates=24000, lr=0.000408248, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=23342 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015 | valid on 'valid' subset | loss 4.302 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.302 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 494 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=383377, ups=0.88, wpb=435708, bsz=16799.8, num_updates=24100, lr=0.0004074, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=23455 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 594 / 1689 loss=4.295, nll_loss=2.687, ppl=6.44, wps=459121, ups=1.06, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.246, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=23549 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 694 / 1689 loss=4.301, nll_loss=2.694, ppl=6.47, wps=457217, ups=1.05, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.242, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=23644 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 795 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=453523, ups=1.05, wpb=431483, bsz=16217.6, num_updates=24400, lr=0.000404888, gnorm=0.243, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=23740 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 895 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=461533, ups=1.06, wpb=434481, bsz=16240.3, num_updates=24500, lr=0.000404061, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=23834 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 995 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=459919, ups=1.06, wpb=432950, bsz=16235, num_updates=24600, lr=0.000403239, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=23928 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1095 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=457967, ups=1.06, wpb=433599, bsz=16766.8, num_updates=24700, lr=0.000402422, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=24022 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1195 / 1689 loss=4.315, nll_loss=2.71, ppl=6.54, wps=460133, ups=1.06, wpb=434142, bsz=16227.2, num_updates=24800, lr=0.00040161, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=24117 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1295 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=458409, ups=1.06, wpb=431848, bsz=16995, num_updates=24900, lr=0.000400802, gnorm=0.235, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=24211 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 epoch 015: 1396 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=457481, ups=1.06, wpb=432490, bsz=16509.2, num_updates=25000, lr=0.0004, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=24306 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015 | valid on 'valid' subset | loss 4.298 | nll_loss 2.652 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.298 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1496 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=330919, ups=0.76, wpb=435441, bsz=16764.6, num_updates=25100, lr=0.000399202, gnorm=0.244, clip=0, loss_scale=2, train_wall=96, gb_free=18.8, wall=24437 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 epoch 015: 1596 / 1689 loss=4.308, nll_loss=2.702, ppl=6.51, wps=467463, ups=1.08, wpb=434756, bsz=16560.2, num_updates=25200, lr=0.00039841, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24530 end of epoch 15 (average epoch stats below) epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 epoch 015 | loss 4.304 | nll_loss 2.697 | ppl 6.49 | wps 443384 | ups 1.02 | wpb 433540 | bsz 16503.9 | num_updates 25293 | lr 0.000397676 | gnorm 0.242 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.4 | wall 24616 Start iterating over samples epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 7 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=461914, ups=1.07, wpb=430544, bsz=15992.9, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=18.4, wall=24623 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 107 / 1689 loss=4.283, nll_loss=2.673, ppl=6.38, wps=457040, ups=1.06, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.231, clip=0, loss_scale=2, train_wall=94, gb_free=18.4, wall=24718 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 207 / 1689 loss=4.292, nll_loss=2.683, ppl=6.42, wps=459990, ups=1.06, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.22, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=24812 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 308 / 1689 loss=4.285, nll_loss=2.676, ppl=6.39, wps=459834, ups=1.06, wpb=435315, bsz=16241.1, num_updates=25600, lr=0.000395285, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=19.6, wall=24907 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 408 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=466799, ups=1.07, wpb=435573, bsz=16038.9, num_updates=25700, lr=0.000394515, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=25000 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 508 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458762, ups=1.06, wpb=433152, bsz=16166.9, num_updates=25800, lr=0.00039375, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25095 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 608 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=457418, ups=1.05, wpb=434241, bsz=16501.8, num_updates=25900, lr=0.000392989, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=25190 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 epoch 016: 708 / 1689 loss=4.291, nll_loss=2.683, ppl=6.42, wps=460632, ups=1.06, wpb=433472, bsz=16516.6, num_updates=26000, lr=0.000392232, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25284 begin validation on "valid" subset epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016 | valid on 'valid' subset | loss 4.29 | nll_loss 2.649 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.29 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 808 / 1689 loss=4.283, nll_loss=2.674, ppl=6.38, wps=375757, ups=0.87, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=25399 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 909 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=451809, ups=1.04, wpb=434266, bsz=16717.8, num_updates=26200, lr=0.000390732, gnorm=0.235, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=25495 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1009 / 1689 loss=4.293, nll_loss=2.685, ppl=6.43, wps=458801, ups=1.06, wpb=434349, bsz=16338.9, num_updates=26300, lr=0.000389989, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=25590 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1109 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=457474, ups=1.06, wpb=433569, bsz=16555.8, num_updates=26400, lr=0.000389249, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=25684 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1209 / 1689 loss=4.296, nll_loss=2.69, ppl=6.45, wps=456904, ups=1.05, wpb=433841, bsz=17111.9, num_updates=26500, lr=0.000388514, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=25779 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1309 / 1689 loss=4.296, nll_loss=2.689, ppl=6.45, wps=457082, ups=1.06, wpb=432960, bsz=16723, num_updates=26600, lr=0.000387783, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=25874 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1410 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456031, ups=1.05, wpb=433228, bsz=16447, num_updates=26700, lr=0.000387056, gnorm=0.241, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=25969 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1510 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=461614, ups=1.07, wpb=433125, bsz=16555.5, num_updates=26800, lr=0.000386334, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=26063 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 epoch 016: 1610 / 1689 loss=4.303, nll_loss=2.698, ppl=6.49, wps=455967, ups=1.05, wpb=432279, bsz=16349.3, num_updates=26900, lr=0.000385615, gnorm=0.249, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=26158 end of epoch 16 (average epoch stats below) epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 epoch 016 | loss 4.293 | nll_loss 2.685 | ppl 6.43 | wps 452403 | ups 1.04 | wpb 433506 | bsz 16505.9 | num_updates 26979 | lr 0.00038505 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 19.9 | wall 26232 Start iterating over samples epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 epoch 017: 21 / 1689 loss=4.303, nll_loss=2.697, ppl=6.48, wps=456013, ups=1.05, wpb=432617, bsz=16624.9, num_updates=27000, lr=0.0003849, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26253 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017 | valid on 'valid' subset | loss 4.286 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.286 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 121 / 1689 loss=4.268, nll_loss=2.657, ppl=6.31, wps=384533, ups=0.88, wpb=436208, bsz=16638.2, num_updates=27100, lr=0.000384189, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=26366 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 221 / 1689 loss=4.272, nll_loss=2.661, ppl=6.32, wps=458469, ups=1.06, wpb=433129, bsz=16457, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=26460 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 322 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=455935, ups=1.06, wpb=431845, bsz=16537.4, num_updates=27300, lr=0.00038278, gnorm=0.234, clip=0, loss_scale=2, train_wall=94, gb_free=18.3, wall=26555 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 422 / 1689 loss=4.287, nll_loss=2.679, ppl=6.4, wps=467064, ups=1.07, wpb=435014, bsz=16541.4, num_updates=27400, lr=0.00038208, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=26648 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 523 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=459744, ups=1.06, wpb=434701, bsz=16291.6, num_updates=27500, lr=0.000381385, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=26743 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 623 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=460354, ups=1.06, wpb=433100, bsz=16241.3, num_updates=27600, lr=0.000380693, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=26837 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 723 / 1689 loss=4.279, nll_loss=2.67, ppl=6.36, wps=455552, ups=1.05, wpb=432285, bsz=16474.2, num_updates=27700, lr=0.000380006, gnorm=0.233, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26932 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 823 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=457543, ups=1.06, wpb=433244, bsz=17077.8, num_updates=27800, lr=0.000379322, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27027 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 923 / 1689 loss=4.299, nll_loss=2.692, ppl=6.46, wps=461676, ups=1.06, wpb=434142, bsz=16675.8, num_updates=27900, lr=0.000378641, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=27121 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 epoch 017: 1023 / 1689 loss=4.281, nll_loss=2.672, ppl=6.37, wps=461085, ups=1.06, wpb=433604, bsz=16325, num_updates=28000, lr=0.000377964, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=27215 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017 | valid on 'valid' subset | loss 4.285 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.285 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1123 / 1689 loss=4.291, nll_loss=2.684, ppl=6.43, wps=338098, ups=0.78, wpb=435278, bsz=16617.7, num_updates=28100, lr=0.000377291, gnorm=0.243, clip=0, loss_scale=2, train_wall=105, gb_free=21.2, wall=27343 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1223 / 1689 loss=4.284, nll_loss=2.676, ppl=6.39, wps=460559, ups=1.06, wpb=434222, bsz=16343.9, num_updates=28200, lr=0.000376622, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=27438 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1323 / 1689 loss=4.287, nll_loss=2.679, ppl=6.41, wps=456381, ups=1.06, wpb=431436, bsz=16284.1, num_updates=28300, lr=0.000375956, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=27532 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1423 / 1689 loss=4.277, nll_loss=2.668, ppl=6.36, wps=459634, ups=1.06, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.22, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=27626 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1523 / 1689 loss=4.292, nll_loss=2.685, ppl=6.43, wps=459443, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.24, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=27720 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 epoch 017: 1624 / 1689 loss=4.301, nll_loss=2.695, ppl=6.48, wps=457047, ups=1.05, wpb=435682, bsz=16675.1, num_updates=28600, lr=0.000373979, gnorm=0.236, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=27816 end of epoch 17 (average epoch stats below) epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 epoch 017 | loss 4.283 | nll_loss 2.674 | ppl 6.38 | wps 444402 | ups 1.03 | wpb 433537 | bsz 16504.3 | num_updates 28665 | lr 0.000373555 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 1581 | gb_free 19.6 | wall 27876 Start iterating over samples epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 35 / 1689 loss=4.276, nll_loss=2.666, ppl=6.35, wps=458094, ups=1.06, wpb=431460, bsz=16079.4, num_updates=28700, lr=0.000373327, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=21.5, wall=27910 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 135 / 1689 loss=4.262, nll_loss=2.65, ppl=6.28, wps=460350, ups=1.06, wpb=434772, bsz=16423.3, num_updates=28800, lr=0.000372678, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=28004 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 235 / 1689 loss=4.275, nll_loss=2.665, ppl=6.34, wps=461618, ups=1.06, wpb=436569, bsz=16461.8, num_updates=28900, lr=0.000372033, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=28099 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 epoch 018: 335 / 1689 loss=4.278, nll_loss=2.669, ppl=6.36, wps=459479, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=28193 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018 | valid on 'valid' subset | loss 4.283 | nll_loss 2.64 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.283 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 436 / 1689 loss=4.268, nll_loss=2.658, ppl=6.31, wps=293983, ups=0.68, wpb=433294, bsz=16705.9, num_updates=29100, lr=0.000370752, gnorm=0.243, clip=0, loss_scale=2, train_wall=108, gb_free=20.1, wall=28341 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 536 / 1689 loss=4.263, nll_loss=2.652, ppl=6.28, wps=467524, ups=1.08, wpb=431946, bsz=16529.5, num_updates=29200, lr=0.000370117, gnorm=0.237, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=28433 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 636 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=463133, ups=1.07, wpb=433963, bsz=16318.3, num_updates=29300, lr=0.000369484, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=28527 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 736 / 1689 loss=4.274, nll_loss=2.664, ppl=6.34, wps=461502, ups=1.06, wpb=434041, bsz=16616.8, num_updates=29400, lr=0.000368856, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=28621 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 836 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=463538, ups=1.07, wpb=434351, bsz=16864.2, num_updates=29500, lr=0.00036823, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=28714 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 936 / 1689 loss=4.28, nll_loss=2.671, ppl=6.37, wps=463091, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=18.1, wall=28808 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1036 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=458949, ups=1.06, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.229, clip=0, loss_scale=4, train_wall=94, gb_free=19.6, wall=28903 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1137 / 1689 loss=4.284, nll_loss=2.677, ppl=6.39, wps=458980, ups=1.06, wpb=434938, bsz=16460.7, num_updates=29800, lr=0.000366372, gnorm=0.236, clip=0, loss_scale=2, train_wall=94, gb_free=20.2, wall=28998 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1237 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=458146, ups=1.06, wpb=432579, bsz=16646.3, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=29092 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 epoch 018: 1337 / 1689 loss=4.28, nll_loss=2.672, ppl=6.37, wps=461088, ups=1.06, wpb=435839, bsz=16280.9, num_updates=30000, lr=0.000365148, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=29187 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018 | valid on 'valid' subset | loss 4.268 | nll_loss 2.622 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.268 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1437 / 1689 loss=4.276, nll_loss=2.667, ppl=6.35, wps=310410, ups=0.72, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=116, gb_free=19.7, wall=29326 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1537 / 1689 loss=4.266, nll_loss=2.656, ppl=6.3, wps=455163, ups=1.06, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29421 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 epoch 018: 1637 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=454796, ups=1.06, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=29516 end of epoch 18 (average epoch stats below) epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 epoch 018 | loss 4.273 | nll_loss 2.664 | ppl 6.34 | wps 433165 | ups 1 | wpb 433512 | bsz 16507.9 | num_updates 30351 | lr 0.000363031 | gnorm 0.234 | clip 0 | loss_scale 2 | train_wall 1604 | gb_free 21 | wall 29564 Start iterating over samples epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 49 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=446872, ups=1.04, wpb=431391, bsz=16524.9, num_updates=30400, lr=0.000362738, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.9, wall=29612 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 149 / 1689 loss=4.236, nll_loss=2.622, ppl=6.16, wps=456173, ups=1.06, wpb=431836, bsz=17002.6, num_updates=30500, lr=0.000362143, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=29707 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 249 / 1689 loss=4.253, nll_loss=2.641, ppl=6.24, wps=459416, ups=1.06, wpb=432659, bsz=16519.5, num_updates=30600, lr=0.000361551, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=29801 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 349 / 1689 loss=4.262, nll_loss=2.651, ppl=6.28, wps=455443, ups=1.05, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=29896 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 449 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=460742, ups=1.06, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=29990 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 549 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=460518, ups=1.06, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.239, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=30085 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 epoch 019: 651 / 1689 loss=4.266, nll_loss=2.655, ppl=6.3, wps=443698, ups=1.02, wpb=433406, bsz=16970.9, num_updates=31000, lr=0.000359211, gnorm=0.234, clip=0, loss_scale=1, train_wall=95, gb_free=19.9, wall=30182 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.629 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.268 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 751 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=345182, ups=0.8, wpb=431671, bsz=16695, num_updates=31100, lr=0.000358633, gnorm=0.221, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=30307 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 851 / 1689 loss=4.258, nll_loss=2.646, ppl=6.26, wps=471298, ups=1.08, wpb=436388, bsz=16387.8, num_updates=31200, lr=0.000358057, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=30400 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 951 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463392, ups=1.07, wpb=433832, bsz=16275.8, num_updates=31300, lr=0.000357485, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30494 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1051 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=460437, ups=1.06, wpb=432512, bsz=16361.9, num_updates=31400, lr=0.000356915, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=30588 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1152 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=459113, ups=1.05, wpb=435668, bsz=16298.6, num_updates=31500, lr=0.000356348, gnorm=0.242, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=30683 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1252 / 1689 loss=4.278, nll_loss=2.67, ppl=6.36, wps=462133, ups=1.06, wpb=434244, bsz=16763, num_updates=31600, lr=0.000355784, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30776 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1352 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=461487, ups=1.06, wpb=435488, bsz=16694.4, num_updates=31700, lr=0.000355222, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=30871 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1452 / 1689 loss=4.264, nll_loss=2.654, ppl=6.3, wps=459904, ups=1.07, wpb=431662, bsz=16146.3, num_updates=31800, lr=0.000354663, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=30965 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1552 / 1689 loss=4.272, nll_loss=2.663, ppl=6.33, wps=459971, ups=1.06, wpb=434265, bsz=16274, num_updates=31900, lr=0.000354107, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31059 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 epoch 019: 1652 / 1689 loss=4.277, nll_loss=2.669, ppl=6.36, wps=457983, ups=1.06, wpb=433951, bsz=16608.9, num_updates=32000, lr=0.000353553, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31154 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 epoch 019 | valid on 'valid' subset | loss 4.268 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.268 end of epoch 19 (average epoch stats below) epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 epoch 019 | loss 4.265 | nll_loss 2.655 | ppl 6.3 | wps 444548 | ups 1.03 | wpb 433531 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1567 | gb_free 20.3 | wall 31208 Start iterating over samples epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 63 / 1689 loss=4.25, nll_loss=2.637, ppl=6.22, wps=376006, ups=0.88, wpb=428948, bsz=16260.9, num_updates=32100, lr=0.000353002, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=31268 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 164 / 1689 loss=4.252, nll_loss=2.639, ppl=6.23, wps=456347, ups=1.05, wpb=433342, bsz=16334.3, num_updates=32200, lr=0.000352454, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=31363 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 264 / 1689 loss=4.249, nll_loss=2.636, ppl=6.22, wps=459793, ups=1.06, wpb=433308, bsz=16279.8, num_updates=32300, lr=0.000351908, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=31457 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 364 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=462391, ups=1.07, wpb=433577, bsz=16754, num_updates=32400, lr=0.000351364, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=31551 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 464 / 1689 loss=4.251, nll_loss=2.639, ppl=6.23, wps=459351, ups=1.06, wpb=433076, bsz=16657.1, num_updates=32500, lr=0.000350823, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=31645 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 564 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=462010, ups=1.06, wpb=435894, bsz=16792.5, num_updates=32600, lr=0.000350285, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=31740 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 664 / 1689 loss=4.261, nll_loss=2.65, ppl=6.28, wps=463463, ups=1.07, wpb=434771, bsz=16495.5, num_updates=32700, lr=0.000349749, gnorm=0.238, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=31833 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 764 / 1689 loss=4.262, nll_loss=2.652, ppl=6.29, wps=458389, ups=1.06, wpb=432938, bsz=16947.8, num_updates=32800, lr=0.000349215, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=31928 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 865 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=450864, ups=1.04, wpb=433529, bsz=16402.3, num_updates=32900, lr=0.000348684, gnorm=0.229, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32024 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 epoch 020: 965 / 1689 loss=4.25, nll_loss=2.638, ppl=6.23, wps=453628, ups=1.05, wpb=431438, bsz=16463.4, num_updates=33000, lr=0.000348155, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=32119 begin validation on "valid" subset epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020 | valid on 'valid' subset | loss 4.268 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.268 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1065 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=356060, ups=0.82, wpb=433306, bsz=16332.2, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=1, train_wall=100, gb_free=19, wall=32241 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1165 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=461988, ups=1.06, wpb=435682, bsz=16472.6, num_updates=33200, lr=0.000347105, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=32335 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1265 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=463253, ups=1.06, wpb=435137, bsz=16684.7, num_updates=33300, lr=0.000346583, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=17.9, wall=32429 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1365 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=464186, ups=1.07, wpb=432590, bsz=16311.7, num_updates=33400, lr=0.000346064, gnorm=0.216, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32522 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1465 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=458957, ups=1.06, wpb=432898, bsz=16470.8, num_updates=33500, lr=0.000345547, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=16.6, wall=32617 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1565 / 1689 loss=4.264, nll_loss=2.654, ppl=6.29, wps=458925, ups=1.06, wpb=431901, bsz=16274.2, num_updates=33600, lr=0.000345033, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=32711 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 epoch 020: 1665 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=463539, ups=1.06, wpb=436599, bsz=16644.7, num_updates=33700, lr=0.00034452, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=32805 end of epoch 20 (average epoch stats below) epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 epoch 020 | loss 4.257 | nll_loss 2.646 | ppl 6.26 | wps 451880 | ups 1.04 | wpb 433518 | bsz 16507.5 | num_updates 33724 | lr 0.000344398 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1575 | gb_free 19.6 | wall 32826 Start iterating over samples epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 76 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455505, ups=1.06, wpb=428676, bsz=16248.1, num_updates=33800, lr=0.00034401, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=32899 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 177 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458364, ups=1.06, wpb=433000, bsz=16544.9, num_updates=33900, lr=0.000343503, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=32993 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 epoch 021: 277 / 1689 loss=4.24, nll_loss=2.626, ppl=6.17, wps=458736, ups=1.06, wpb=432851, bsz=16585.4, num_updates=34000, lr=0.000342997, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=33088 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.253 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.253 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 377 / 1689 loss=4.244, nll_loss=2.631, ppl=6.19, wps=371134, ups=0.86, wpb=433025, bsz=16788.2, num_updates=34100, lr=0.000342494, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=20.5, wall=33204 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 477 / 1689 loss=4.252, nll_loss=2.64, ppl=6.23, wps=454098, ups=1.05, wpb=431522, bsz=16642.7, num_updates=34200, lr=0.000341993, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=33300 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 577 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=464003, ups=1.06, wpb=437667, bsz=16534.7, num_updates=34300, lr=0.000341494, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=33394 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 678 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=452634, ups=1.05, wpb=432836, bsz=16349.3, num_updates=34400, lr=0.000340997, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=33489 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 778 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457191, ups=1.06, wpb=433255, bsz=16609.9, num_updates=34500, lr=0.000340503, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=33584 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 878 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=458879, ups=1.06, wpb=433422, bsz=16329.4, num_updates=34600, lr=0.00034001, gnorm=0.237, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=33679 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 978 / 1689 loss=4.25, nll_loss=2.638, ppl=6.22, wps=458223, ups=1.06, wpb=432998, bsz=16410.2, num_updates=34700, lr=0.00033952, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=33773 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1078 / 1689 loss=4.245, nll_loss=2.633, ppl=6.2, wps=458704, ups=1.05, wpb=435949, bsz=16365.7, num_updates=34800, lr=0.000339032, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=20.3, wall=33868 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1178 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=455946, ups=1.05, wpb=433670, bsz=16761.5, num_updates=34900, lr=0.000338546, gnorm=0.23, clip=0, loss_scale=2, train_wall=94, gb_free=19.9, wall=33963 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 epoch 021: 1278 / 1689 loss=4.26, nll_loss=2.65, ppl=6.28, wps=460243, ups=1.06, wpb=435449, bsz=16595.2, num_updates=35000, lr=0.000338062, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=34058 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021 | valid on 'valid' subset | loss 4.256 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.253 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1378 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=408229, ups=0.94, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.217, clip=0, loss_scale=2, train_wall=94, gb_free=16.7, wall=34164 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1478 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=459503, ups=1.06, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=34258 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1578 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=462729, ups=1.07, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.232, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=34352 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 epoch 021: 1678 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=459904, ups=1.06, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.241, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=34447 end of epoch 21 (average epoch stats below) epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 epoch 021 | loss 4.25 | nll_loss 2.638 | ppl 6.23 | wps 448464 | ups 1.03 | wpb 433535 | bsz 16505.1 | num_updates 35410 | lr 0.000336099 | gnorm 0.227 | clip 0 | loss_scale 2 | train_wall 1574 | gb_free 18.2 | wall 34456 Start iterating over samples epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 91 / 1689 loss=4.224, nll_loss=2.608, ppl=6.1, wps=441540, ups=1.02, wpb=430934, bsz=16355, num_updates=35500, lr=0.000335673, gnorm=0.226, clip=0, loss_scale=1, train_wall=95, gb_free=19, wall=34544 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 191 / 1689 loss=4.229, nll_loss=2.614, ppl=6.12, wps=462008, ups=1.07, wpb=430869, bsz=16367.4, num_updates=35600, lr=0.000335201, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=34638 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 291 / 1689 loss=4.247, nll_loss=2.634, ppl=6.21, wps=457064, ups=1.05, wpb=433306, bsz=16517, num_updates=35700, lr=0.000334731, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34732 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 391 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=458910, ups=1.06, wpb=434244, bsz=16614.7, num_updates=35800, lr=0.000334263, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=34827 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 491 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=455169, ups=1.05, wpb=434275, bsz=16572.5, num_updates=35900, lr=0.000333797, gnorm=0.211, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=34922 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 epoch 022: 591 / 1689 loss=4.239, nll_loss=2.626, ppl=6.17, wps=457430, ups=1.05, wpb=434087, bsz=16595.8, num_updates=36000, lr=0.000333333, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=35017 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.253 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 691 / 1689 loss=4.235, nll_loss=2.621, ppl=6.15, wps=403759, ups=0.93, wpb=435016, bsz=16543.1, num_updates=36100, lr=0.000332871, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=35125 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 791 / 1689 loss=4.249, nll_loss=2.637, ppl=6.22, wps=461479, ups=1.07, wpb=433170, bsz=16262.5, num_updates=36200, lr=0.000332411, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=35219 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 891 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=457648, ups=1.05, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=21.2, wall=35314 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 991 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=461186, ups=1.06, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=35408 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1091 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=460887, ups=1.06, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.214, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=35503 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1192 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=455951, ups=1.05, wpb=434006, bsz=16405.4, num_updates=36600, lr=0.00033059, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=35598 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1292 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=457158, ups=1.06, wpb=431764, bsz=16550.2, num_updates=36700, lr=0.000330139, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=35692 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1393 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=456512, ups=1.05, wpb=434814, bsz=16786.4, num_updates=36800, lr=0.00032969, gnorm=0.226, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=35788 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1493 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=455798, ups=1.05, wpb=433584, bsz=16313.5, num_updates=36900, lr=0.000329243, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35883 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 epoch 022: 1593 / 1689 loss=4.261, nll_loss=2.652, ppl=6.28, wps=459127, ups=1.06, wpb=433216, bsz=16551.9, num_updates=37000, lr=0.000328798, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=35977 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 epoch 022 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.253 end of epoch 22 (average epoch stats below) epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 epoch 022 | loss 4.243 | nll_loss 2.631 | ppl 6.19 | wps 450571 | ups 1.04 | wpb 433528 | bsz 16506.2 | num_updates 37096 | lr 0.000328372 | gnorm 0.223 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.8 | wall 36079 Start iterating over samples epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 4 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=401641, ups=0.94, wpb=428456, bsz=16397.8, num_updates=37100, lr=0.000328355, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=17.2, wall=36084 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 104 / 1689 loss=4.23, nll_loss=2.615, ppl=6.13, wps=463132, ups=1.07, wpb=434357, bsz=16561.2, num_updates=37200, lr=0.000327913, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=36178 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 204 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=456488, ups=1.05, wpb=434190, bsz=17055.4, num_updates=37300, lr=0.000327473, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=36273 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 304 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=458499, ups=1.06, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=36368 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 404 / 1689 loss=4.241, nll_loss=2.628, ppl=6.18, wps=458839, ups=1.06, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=36462 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 504 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=459048, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.234, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=36557 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 604 / 1689 loss=4.231, nll_loss=2.617, ppl=6.13, wps=459579, ups=1.06, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=36651 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 704 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=457004, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.238, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=36746 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 805 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=454590, ups=1.05, wpb=432395, bsz=16588.1, num_updates=37900, lr=0.000324871, gnorm=0.223, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=36841 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 epoch 023: 905 / 1689 loss=4.246, nll_loss=2.634, ppl=6.21, wps=462204, ups=1.06, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=19.8, wall=36935 begin validation on "valid" subset epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.619 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.253 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1005 / 1689 loss=4.236, nll_loss=2.623, ppl=6.16, wps=122192, ups=0.28, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.237, clip=0, loss_scale=2, train_wall=274, gb_free=19.6, wall=37288 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1105 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=471470, ups=1.08, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.225, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=37380 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1205 / 1689 loss=4.238, nll_loss=2.626, ppl=6.17, wps=467472, ups=1.08, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=37473 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1306 / 1689 loss=4.247, nll_loss=2.636, ppl=6.22, wps=464529, ups=1.07, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=37567 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1406 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=460711, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37661 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1506 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=462044, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=37755 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 epoch 023: 1606 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=460174, ups=1.06, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.233, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=37849 end of epoch 23 (average epoch stats below) epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 epoch 023 | loss 4.237 | nll_loss 2.624 | ppl 6.17 | wps 395972 | ups 0.91 | wpb 433538 | bsz 16506.8 | num_updates 38783 | lr 0.000321151 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1747 | gb_free 21.9 | wall 37926 Start iterating over samples epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 18 / 1689 loss=4.242, nll_loss=2.63, ppl=6.19, wps=447965, ups=1.04, wpb=430057, bsz=16066.7, num_updates=38800, lr=0.000321081, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=37945 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 118 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=460875, ups=1.06, wpb=433852, bsz=16766.2, num_updates=38900, lr=0.000320668, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=38039 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 epoch 024: 218 / 1689 loss=4.218, nll_loss=2.603, ppl=6.07, wps=456614, ups=1.06, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=38133 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.257 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.253 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 318 / 1689 loss=4.226, nll_loss=2.611, ppl=6.11, wps=398409, ups=0.92, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=38242 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 418 / 1689 loss=4.231, nll_loss=2.617, ppl=6.14, wps=460840, ups=1.06, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=38337 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 518 / 1689 loss=4.227, nll_loss=2.612, ppl=6.11, wps=459352, ups=1.06, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38431 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 618 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461329, ups=1.06, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=38526 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 718 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=459028, ups=1.06, wpb=434820, bsz=16193.7, num_updates=39500, lr=0.000318223, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=38620 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 818 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=460534, ups=1.06, wpb=432525, bsz=16289.6, num_updates=39600, lr=0.000317821, gnorm=0.227, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=38714 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 918 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=458006, ups=1.05, wpb=434320, bsz=16624.6, num_updates=39700, lr=0.00031742, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=38809 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1018 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=458754, ups=1.06, wpb=433091, bsz=16553.4, num_updates=39800, lr=0.000317021, gnorm=0.23, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=38903 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1119 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=450859, ups=1.04, wpb=434448, bsz=17107.4, num_updates=39900, lr=0.000316624, gnorm=0.229, clip=0, loss_scale=2, train_wall=95, gb_free=19.1, wall=39000 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 epoch 024: 1220 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=456981, ups=1.05, wpb=434779, bsz=16586.2, num_updates=40000, lr=0.000316228, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=39095 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024 | valid on 'valid' subset | loss 4.254 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.253 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1320 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=396979, ups=0.92, wpb=431641, bsz=16486.1, num_updates=40100, lr=0.000315833, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=17, wall=39204 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1420 / 1689 loss=4.235, nll_loss=2.622, ppl=6.16, wps=460554, ups=1.06, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=39298 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1520 / 1689 loss=4.24, nll_loss=2.628, ppl=6.18, wps=459449, ups=1.06, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=39392 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 epoch 024: 1620 / 1689 loss=4.242, nll_loss=2.631, ppl=6.19, wps=459746, ups=1.06, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=39487 end of epoch 24 (average epoch stats below) epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 epoch 024 | loss 4.231 | nll_loss 2.618 | ppl 6.14 | wps 449860 | ups 1.04 | wpb 433553 | bsz 16503.8 | num_updates 40469 | lr 0.00031439 | gnorm 0.223 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.7 | wall 39550 Start iterating over samples epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 31 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=461449, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.22, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=39580 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 131 / 1689 loss=4.207, nll_loss=2.59, ppl=6.02, wps=454637, ups=1.05, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=39675 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 232 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=454827, ups=1.04, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=39771 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 332 / 1689 loss=4.225, nll_loss=2.61, ppl=6.11, wps=461627, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=39865 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 432 / 1689 loss=4.222, nll_loss=2.607, ppl=6.09, wps=458459, ups=1.06, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39959 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 epoch 025: 532 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460809, ups=1.07, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.212, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=40053 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025 | valid on 'valid' subset | loss 4.257 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.253 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 632 / 1689 loss=4.232, nll_loss=2.619, ppl=6.14, wps=403792, ups=0.93, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=40161 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 732 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458601, ups=1.06, wpb=433514, bsz=16729.7, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=40255 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 832 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=462177, ups=1.07, wpb=433601, bsz=16431.4, num_updates=41300, lr=0.000311211, gnorm=0.214, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=40349 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 932 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=458232, ups=1.06, wpb=433190, bsz=16801.8, num_updates=41400, lr=0.000310835, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=40444 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1032 / 1689 loss=4.237, nll_loss=2.624, ppl=6.17, wps=459967, ups=1.06, wpb=434707, bsz=16306.8, num_updates=41500, lr=0.00031046, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=40538 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1132 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=460316, ups=1.06, wpb=432751, bsz=16194.4, num_updates=41600, lr=0.000310087, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=40632 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1233 / 1689 loss=4.229, nll_loss=2.615, ppl=6.13, wps=458366, ups=1.05, wpb=434973, bsz=16379.8, num_updates=41700, lr=0.000309715, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=40727 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1333 / 1689 loss=4.234, nll_loss=2.621, ppl=6.15, wps=457546, ups=1.05, wpb=435590, bsz=16672.7, num_updates=41800, lr=0.000309344, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=16, wall=40822 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1433 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=461051, ups=1.06, wpb=434587, bsz=16406.1, num_updates=41900, lr=0.000308975, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=40916 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 epoch 025: 1533 / 1689 loss=4.234, nll_loss=2.622, ppl=6.15, wps=457975, ups=1.06, wpb=433836, bsz=16710.3, num_updates=42000, lr=0.000308607, gnorm=0.223, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41011 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025 | valid on 'valid' subset | loss 4.25 | nll_loss 2.605 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.25 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 epoch 025: 1633 / 1689 loss=4.225, nll_loss=2.611, ppl=6.11, wps=382274, ups=0.89, wpb=431543, bsz=16411.6, num_updates=42100, lr=0.00030824, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=41124 end of epoch 25 (average epoch stats below) epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 epoch 025 | loss 4.226 | nll_loss 2.612 | ppl 6.11 | wps 450014 | ups 1.04 | wpb 433535 | bsz 16504.6 | num_updates 42156 | lr 0.000308035 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1565 | gb_free 19.1 | wall 41176 Start iterating over samples epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 44 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=460328, ups=1.07, wpb=431578, bsz=16497.4, num_updates=42200, lr=0.000307875, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=41218 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 144 / 1689 loss=4.209, nll_loss=2.592, ppl=6.03, wps=462009, ups=1.06, wpb=434832, bsz=16435.6, num_updates=42300, lr=0.00030751, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=41312 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 244 / 1689 loss=4.219, nll_loss=2.604, ppl=6.08, wps=463034, ups=1.07, wpb=434318, bsz=16828.5, num_updates=42400, lr=0.000307148, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=41406 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 344 / 1689 loss=4.22, nll_loss=2.606, ppl=6.09, wps=462628, ups=1.06, wpb=435632, bsz=16589.4, num_updates=42500, lr=0.000306786, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=41500 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 444 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=459369, ups=1.06, wpb=433543, bsz=16249.4, num_updates=42600, lr=0.000306426, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=41594 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 545 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=452712, ups=1.05, wpb=431630, bsz=16672.1, num_updates=42700, lr=0.000306067, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=17.9, wall=41690 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 645 / 1689 loss=4.211, nll_loss=2.595, ppl=6.04, wps=459207, ups=1.06, wpb=433371, bsz=16334.2, num_updates=42800, lr=0.000305709, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=41784 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 745 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=457817, ups=1.05, wpb=435600, bsz=16531.8, num_updates=42900, lr=0.000305352, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=41879 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 epoch 026: 845 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=458571, ups=1.06, wpb=432695, bsz=16498.6, num_updates=43000, lr=0.000304997, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41973 begin validation on "valid" subset epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026 | valid on 'valid' subset | loss 4.253 | nll_loss 2.615 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.25 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 945 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=408960, ups=0.94, wpb=435404, bsz=16445, num_updates=43100, lr=0.000304643, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=42080 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1045 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=458660, ups=1.06, wpb=431425, bsz=16351.7, num_updates=43200, lr=0.00030429, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=42174 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1146 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=457785, ups=1.06, wpb=432170, bsz=16933.6, num_updates=43300, lr=0.000303939, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=16.4, wall=42268 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1246 / 1689 loss=4.221, nll_loss=2.607, ppl=6.09, wps=461620, ups=1.06, wpb=433462, bsz=16241.4, num_updates=43400, lr=0.000303588, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=42362 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1346 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=454688, ups=1.05, wpb=432424, bsz=16458.2, num_updates=43500, lr=0.000303239, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=42457 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1446 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=459679, ups=1.06, wpb=435322, bsz=16713.3, num_updates=43600, lr=0.000302891, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=42552 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1546 / 1689 loss=4.226, nll_loss=2.612, ppl=6.12, wps=460666, ups=1.06, wpb=432886, bsz=16430.4, num_updates=43700, lr=0.000302545, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=42646 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 epoch 026: 1647 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=454191, ups=1.05, wpb=433079, bsz=16347.4, num_updates=43800, lr=0.000302199, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=42741 end of epoch 26 (average epoch stats below) epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 epoch 026 | loss 4.22 | nll_loss 2.606 | ppl 6.09 | wps 455407 | ups 1.05 | wpb 433529 | bsz 16505.8 | num_updates 43842 | lr 0.000302054 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 18.9 | wall 42781 Start iterating over samples epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 58 / 1689 loss=4.208, nll_loss=2.592, ppl=6.03, wps=456349, ups=1.06, wpb=430738, bsz=16103.1, num_updates=43900, lr=0.000301855, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=42836 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 epoch 027: 158 / 1689 loss=4.214, nll_loss=2.598, ppl=6.05, wps=460595, ups=1.06, wpb=433877, bsz=16713.2, num_updates=44000, lr=0.000301511, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=42930 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027 | valid on 'valid' subset | loss 4.249 | nll_loss 2.608 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.249 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 258 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=122382, ups=0.28, wpb=433868, bsz=16091, num_updates=44100, lr=0.000301169, gnorm=0.215, clip=0, loss_scale=1, train_wall=141, gb_free=18.3, wall=43285 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 358 / 1689 loss=4.204, nll_loss=2.587, ppl=6.01, wps=468448, ups=1.08, wpb=434357, bsz=16428, num_updates=44200, lr=0.000300828, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=43377 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 458 / 1689 loss=4.21, nll_loss=2.594, ppl=6.04, wps=466424, ups=1.08, wpb=432642, bsz=16404.6, num_updates=44300, lr=0.000300489, gnorm=0.229, clip=0, loss_scale=2, train_wall=92, gb_free=18.2, wall=43470 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 558 / 1689 loss=4.208, nll_loss=2.591, ppl=6.03, wps=465797, ups=1.07, wpb=433306, bsz=16401.4, num_updates=44400, lr=0.00030015, gnorm=0.205, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=43563 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 658 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461218, ups=1.06, wpb=435113, bsz=16439.8, num_updates=44500, lr=0.000299813, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=43657 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 759 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458931, ups=1.06, wpb=434363, bsz=16611.8, num_updates=44600, lr=0.000299476, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=43752 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 859 / 1689 loss=4.228, nll_loss=2.614, ppl=6.12, wps=461851, ups=1.06, wpb=435628, bsz=16706.8, num_updates=44700, lr=0.000299141, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=43846 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 959 / 1689 loss=4.207, nll_loss=2.591, ppl=6.02, wps=455996, ups=1.06, wpb=431603, bsz=16324.5, num_updates=44800, lr=0.000298807, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=43941 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1059 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=456740, ups=1.05, wpb=433707, bsz=16829.2, num_updates=44900, lr=0.000298474, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=44036 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 epoch 027: 1159 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=457455, ups=1.05, wpb=434218, bsz=16940, num_updates=45000, lr=0.000298142, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=44131 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027 | valid on 'valid' subset | loss 4.239 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.239 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1259 / 1689 loss=4.215, nll_loss=2.601, ppl=6.07, wps=307958, ups=0.71, wpb=436413, bsz=16455.6, num_updates=45100, lr=0.000297812, gnorm=0.209, clip=0, loss_scale=1, train_wall=115, gb_free=19.5, wall=44273 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1360 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=457392, ups=1.06, wpb=433001, bsz=16411.9, num_updates=45200, lr=0.000297482, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44367 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1460 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=456759, ups=1.06, wpb=431825, bsz=16529, num_updates=45300, lr=0.000297154, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=44462 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1560 / 1689 loss=4.221, nll_loss=2.608, ppl=6.09, wps=457952, ups=1.06, wpb=432210, bsz=16408.3, num_updates=45400, lr=0.000296826, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=44556 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 epoch 027: 1660 / 1689 loss=4.228, nll_loss=2.616, ppl=6.13, wps=459910, ups=1.06, wpb=432973, bsz=16660.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=44650 end of epoch 27 (average epoch stats below) epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 epoch 027 | loss 4.216 | nll_loss 2.601 | ppl 6.07 | wps 385645 | ups 0.89 | wpb 433516 | bsz 16504.1 | num_updates 45529 | lr 0.000296405 | gnorm 0.221 | clip 0 | loss_scale 1 | train_wall 1638 | gb_free 20.3 | wall 44677 Start iterating over samples epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 71 / 1689 loss=4.209, nll_loss=2.593, ppl=6.03, wps=457235, ups=1.06, wpb=431090, bsz=16531.7, num_updates=45600, lr=0.000296174, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=44745 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 171 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=458169, ups=1.06, wpb=434035, bsz=16575.5, num_updates=45700, lr=0.00029585, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=44839 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 271 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459098, ups=1.06, wpb=433320, bsz=16397.9, num_updates=45800, lr=0.000295527, gnorm=0.222, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=44934 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 371 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=460171, ups=1.06, wpb=433657, bsz=16621.9, num_updates=45900, lr=0.000295205, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=45028 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 epoch 028: 471 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=456116, ups=1.05, wpb=433458, bsz=16666.1, num_updates=46000, lr=0.000294884, gnorm=0.221, clip=0, loss_scale=2, train_wall=94, gb_free=20.3, wall=45123 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.244 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.239 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 572 / 1689 loss=4.213, nll_loss=2.597, ppl=6.05, wps=404301, ups=0.93, wpb=433038, bsz=16883.7, num_updates=46100, lr=0.000294564, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=45230 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 672 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460142, ups=1.06, wpb=435236, bsz=16489, num_updates=46200, lr=0.000294245, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=45325 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 772 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=462754, ups=1.07, wpb=432520, bsz=16495, num_updates=46300, lr=0.000293927, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=45418 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 872 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=455197, ups=1.06, wpb=431210, bsz=16367.2, num_updates=46400, lr=0.00029361, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=45513 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 972 / 1689 loss=4.222, nll_loss=2.608, ppl=6.09, wps=456588, ups=1.05, wpb=434933, bsz=16478.1, num_updates=46500, lr=0.000293294, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=45608 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1072 / 1689 loss=4.224, nll_loss=2.61, ppl=6.1, wps=459319, ups=1.06, wpb=432840, bsz=16089.4, num_updates=46600, lr=0.000292979, gnorm=0.22, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=45702 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1172 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454237, ups=1.05, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.218, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=45798 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1272 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=462685, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.221, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=45892 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1372 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=460120, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.231, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=45986 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 epoch 028: 1473 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=453859, ups=1.05, wpb=431860, bsz=16621.9, num_updates=47000, lr=0.00029173, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=46081 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028 | valid on 'valid' subset | loss 4.239 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.239 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1573 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=372878, ups=0.86, wpb=433870, bsz=16495.4, num_updates=47100, lr=0.00029142, gnorm=0.214, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=46198 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 epoch 028: 1673 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=460583, ups=1.05, wpb=436576, bsz=16773.6, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=46293 end of epoch 28 (average epoch stats below) epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 epoch 028 | loss 4.211 | nll_loss 2.596 | ppl 6.04 | wps 448628 | ups 1.03 | wpb 433533 | bsz 16505.1 | num_updates 47216 | lr 0.000291062 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.5 | wall 46307 Start iterating over samples epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 84 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=453900, ups=1.06, wpb=430001, bsz=16353.7, num_updates=47300, lr=0.000290803, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=46387 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 184 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=456104, ups=1.05, wpb=434401, bsz=16865.9, num_updates=47400, lr=0.000290496, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46482 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 284 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=459013, ups=1.06, wpb=433188, bsz=16337.4, num_updates=47500, lr=0.000290191, gnorm=0.215, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=46577 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 384 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459550, ups=1.06, wpb=431504, bsz=16112.8, num_updates=47600, lr=0.000289886, gnorm=0.213, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=46671 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 484 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455758, ups=1.05, wpb=433523, bsz=16805.2, num_updates=47700, lr=0.000289581, gnorm=0.222, clip=0, loss_scale=2, train_wall=94, gb_free=19.8, wall=46766 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 585 / 1689 loss=4.201, nll_loss=2.584, ppl=6, wps=451019, ups=1.04, wpb=433549, bsz=16786.3, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=1, train_wall=95, gb_free=18.5, wall=46862 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 685 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=462490, ups=1.07, wpb=432807, bsz=16555.3, num_updates=47900, lr=0.000288976, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=46956 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 epoch 029: 785 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460589, ups=1.06, wpb=433342, bsz=16405.1, num_updates=48000, lr=0.000288675, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=47050 begin validation on "valid" subset epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029 | valid on 'valid' subset | loss 4.256 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.239 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 885 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=400826, ups=0.92, wpb=433749, bsz=16525.2, num_updates=48100, lr=0.000288375, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=47158 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 985 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460139, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=47252 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1085 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=462653, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=47346 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1185 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=460897, ups=1.06, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=47440 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1285 / 1689 loss=4.214, nll_loss=2.6, ppl=6.06, wps=464733, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.218, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=47533 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1386 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=456147, ups=1.05, wpb=434091, bsz=16519.7, num_updates=48600, lr=0.000286888, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=47629 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1486 / 1689 loss=4.213, nll_loss=2.599, ppl=6.06, wps=463359, ups=1.06, wpb=435377, bsz=16337.7, num_updates=48700, lr=0.000286593, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=47723 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1586 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=458790, ups=1.06, wpb=433443, bsz=16240.8, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47817 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 epoch 029: 1686 / 1689 loss=4.21, nll_loss=2.595, ppl=6.04, wps=462717, ups=1.06, wpb=436745, bsz=16359.8, num_updates=48900, lr=0.000286006, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=47911 end of epoch 29 (average epoch stats below) epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 epoch 029 | loss 4.207 | nll_loss 2.591 | ppl 6.02 | wps 455357 | ups 1.05 | wpb 433533 | bsz 16508.2 | num_updates 48903 | lr 0.000285998 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 22.6 | wall 47913 Start iterating over samples epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 epoch 030: 97 / 1689 loss=4.198, nll_loss=2.58, ppl=5.98, wps=457792, ups=1.06, wpb=430078, bsz=16546.7, num_updates=49000, lr=0.000285714, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=48005 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.241 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.239 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 197 / 1689 loss=4.188, nll_loss=2.569, ppl=5.93, wps=409193, ups=0.94, wpb=433492, bsz=16168.6, num_updates=49100, lr=0.000285423, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=48111 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 297 / 1689 loss=4.205, nll_loss=2.588, ppl=6.01, wps=459094, ups=1.06, wpb=434004, bsz=16236.6, num_updates=49200, lr=0.000285133, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=48206 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 397 / 1689 loss=4.2, nll_loss=2.582, ppl=5.99, wps=458003, ups=1.06, wpb=433836, bsz=16393.6, num_updates=49300, lr=0.000284844, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=48301 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 498 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=455219, ups=1.05, wpb=432453, bsz=16861, num_updates=49400, lr=0.000284555, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=48396 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 598 / 1689 loss=4.201, nll_loss=2.585, ppl=6, wps=455236, ups=1.05, wpb=432566, bsz=16514.9, num_updates=49500, lr=0.000284268, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=48491 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 698 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456854, ups=1.05, wpb=433366, bsz=16362.8, num_updates=49600, lr=0.000283981, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=48586 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 798 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=458828, ups=1.05, wpb=435050, bsz=16328.7, num_updates=49700, lr=0.000283695, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=17.6, wall=48680 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 898 / 1689 loss=4.215, nll_loss=2.6, ppl=6.06, wps=461941, ups=1.06, wpb=435309, bsz=16415.4, num_updates=49800, lr=0.00028341, gnorm=0.228, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=48775 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 999 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=451097, ups=1.04, wpb=432734, bsz=16569.4, num_updates=49900, lr=0.000283126, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=48871 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 epoch 030: 1099 / 1689 loss=4.197, nll_loss=2.58, ppl=5.98, wps=455865, ups=1.06, wpb=431462, bsz=16634.5, num_updates=50000, lr=0.000282843, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=48965 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1199 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=157615, ups=0.36, wpb=432032, bsz=16965.2, num_updates=50100, lr=0.00028256, gnorm=0.223, clip=0, loss_scale=1, train_wall=122, gb_free=18.3, wall=49239 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1299 / 1689 loss=4.218, nll_loss=2.604, ppl=6.08, wps=468490, ups=1.07, wpb=436056, bsz=16261.2, num_updates=50200, lr=0.000282279, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=49332 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1399 / 1689 loss=4.202, nll_loss=2.585, ppl=6, wps=461048, ups=1.06, wpb=433244, bsz=16648.7, num_updates=50300, lr=0.000281998, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=49426 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1499 / 1689 loss=4.206, nll_loss=2.59, ppl=6.02, wps=460777, ups=1.06, wpb=432808, bsz=16676, num_updates=50400, lr=0.000281718, gnorm=0.229, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=49520 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 epoch 030: 1600 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=457618, ups=1.05, wpb=437576, bsz=16531.4, num_updates=50500, lr=0.000281439, gnorm=0.213, clip=0, loss_scale=1, train_wall=95, gb_free=18.8, wall=49616 end of epoch 30 (average epoch stats below) epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 epoch 030 | loss 4.202 | nll_loss 2.586 | ppl 6 | wps 409461 | ups 0.94 | wpb 433534 | bsz 16506.2 | num_updates 50589 | lr 0.000281191 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1597 | gb_free 20.6 | wall 49699 Start iterating over samples epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 11 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=459015, ups=1.07, wpb=430286, bsz=16395.7, num_updates=50600, lr=0.000281161, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=49710 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 111 / 1689 loss=4.179, nll_loss=2.559, ppl=5.89, wps=459510, ups=1.06, wpb=431905, bsz=16454.5, num_updates=50700, lr=0.000280883, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=49804 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 211 / 1689 loss=4.187, nll_loss=2.568, ppl=5.93, wps=460707, ups=1.06, wpb=433736, bsz=16736.8, num_updates=50800, lr=0.000280607, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=49898 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 311 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=465424, ups=1.07, wpb=434465, bsz=16244.1, num_updates=50900, lr=0.000280331, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=49991 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 epoch 031: 411 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461634, ups=1.07, wpb=432591, bsz=16408.1, num_updates=51000, lr=0.000280056, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=50085 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.239 | nll_loss 2.596 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.239 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 512 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=373362, ups=0.86, wpb=435072, bsz=16735.3, num_updates=51100, lr=0.000279782, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=50201 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 612 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=457978, ups=1.06, wpb=433838, bsz=16533.9, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=17, wall=50296 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 712 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460235, ups=1.06, wpb=435127, bsz=16609.8, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=50391 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 812 / 1689 loss=4.211, nll_loss=2.596, ppl=6.05, wps=465131, ups=1.07, wpb=435701, bsz=16174.7, num_updates=51400, lr=0.000278964, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=50484 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 912 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459532, ups=1.06, wpb=435444, bsz=16435.8, num_updates=51500, lr=0.000278693, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=50579 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1012 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=454492, ups=1.06, wpb=430510, bsz=16735.7, num_updates=51600, lr=0.000278423, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=50674 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1112 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=459169, ups=1.06, wpb=432986, bsz=16050.6, num_updates=51700, lr=0.000278154, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=18.3, wall=50768 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1212 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461102, ups=1.06, wpb=434211, bsz=16676.5, num_updates=51800, lr=0.000277885, gnorm=0.215, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=50862 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1312 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=462108, ups=1.07, wpb=432316, bsz=16311.8, num_updates=51900, lr=0.000277617, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=17.2, wall=50956 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 epoch 031: 1413 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=454787, ups=1.05, wpb=434534, bsz=16850.2, num_updates=52000, lr=0.00027735, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=51051 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031 | valid on 'valid' subset | loss 4.254 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.239 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1513 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=407211, ups=0.94, wpb=433848, bsz=16958, num_updates=52100, lr=0.000277084, gnorm=0.213, clip=0, loss_scale=1, train_wall=92, gb_free=17, wall=51158 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 epoch 031: 1613 / 1689 loss=4.214, nll_loss=2.599, ppl=6.06, wps=462096, ups=1.06, wpb=434278, bsz=16246.2, num_updates=52200, lr=0.000276818, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.9, wall=51252 end of epoch 31 (average epoch stats below) epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 epoch 031 | loss 4.198 | nll_loss 2.581 | ppl 5.99 | wps 450261 | ups 1.04 | wpb 433512 | bsz 16505.2 | num_updates 52276 | lr 0.000276617 | gnorm 0.219 | clip 0 | loss_scale 1 | train_wall 1562 | gb_free 20.3 | wall 51323 Start iterating over samples epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 24 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=453545, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51346 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 124 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=458928, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=51441 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 224 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=456162, ups=1.05, wpb=433690, bsz=16513.8, num_updates=52500, lr=0.000276026, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=51536 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 324 / 1689 loss=4.195, nll_loss=2.577, ppl=5.97, wps=456227, ups=1.06, wpb=432344, bsz=16461.6, num_updates=52600, lr=0.000275764, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=16, wall=51631 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 424 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=461659, ups=1.06, wpb=435274, bsz=16641.7, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=51725 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 525 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=449621, ups=1.04, wpb=433911, bsz=16734.2, num_updates=52800, lr=0.000275241, gnorm=0.216, clip=0, loss_scale=1, train_wall=95, gb_free=18, wall=51821 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 625 / 1689 loss=4.192, nll_loss=2.574, ppl=5.96, wps=456245, ups=1.05, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=51916 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 epoch 032: 725 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457932, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=52011 begin validation on "valid" subset epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032 | valid on 'valid' subset | loss 4.235 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.235 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 825 / 1689 loss=4.2, nll_loss=2.583, ppl=5.99, wps=378951, ups=0.87, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=52126 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 925 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=459727, ups=1.06, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=52220 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1026 / 1689 loss=4.191, nll_loss=2.573, ppl=5.95, wps=458448, ups=1.06, wpb=432903, bsz=16120.9, num_updates=53300, lr=0.000273947, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=52315 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1126 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=463308, ups=1.06, wpb=435495, bsz=16461.9, num_updates=53400, lr=0.00027369, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=52409 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1226 / 1689 loss=4.202, nll_loss=2.586, ppl=6, wps=460630, ups=1.06, wpb=433453, bsz=16221.3, num_updates=53500, lr=0.000273434, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=52503 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1326 / 1689 loss=4.191, nll_loss=2.574, ppl=5.95, wps=457363, ups=1.06, wpb=432503, bsz=16258.2, num_updates=53600, lr=0.000273179, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=52597 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1426 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=456739, ups=1.05, wpb=433371, bsz=16792.2, num_updates=53700, lr=0.000272925, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=52692 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1526 / 1689 loss=4.199, nll_loss=2.584, ppl=5.99, wps=461089, ups=1.06, wpb=435592, bsz=16658, num_updates=53800, lr=0.000272671, gnorm=0.209, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=52787 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 epoch 032: 1627 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=450811, ups=1.04, wpb=432741, bsz=16583.9, num_updates=53900, lr=0.000272418, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=52883 end of epoch 32 (average epoch stats below) epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 epoch 032 | loss 4.194 | nll_loss 2.577 | ppl 5.97 | wps 451836 | ups 1.04 | wpb 433514 | bsz 16504.2 | num_updates 53962 | lr 0.000272261 | gnorm 0.216 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 21.7 | wall 52941 Start iterating over samples epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 epoch 033: 38 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=457850, ups=1.06, wpb=431267, bsz=16401.4, num_updates=54000, lr=0.000272166, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=52977 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.6 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.235 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 138 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=404603, ups=0.94, wpb=432470, bsz=16481.5, num_updates=54100, lr=0.000271914, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=53084 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 239 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=455111, ups=1.05, wpb=433078, bsz=16857, num_updates=54200, lr=0.000271663, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=53179 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 339 / 1689 loss=4.176, nll_loss=2.556, ppl=5.88, wps=460937, ups=1.06, wpb=434174, bsz=16204.9, num_updates=54300, lr=0.000271413, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=53273 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 439 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=459784, ups=1.06, wpb=432636, bsz=16393.7, num_updates=54400, lr=0.000271163, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53367 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 539 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=465708, ups=1.07, wpb=434897, bsz=16496.8, num_updates=54500, lr=0.000270914, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=53461 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 639 / 1689 loss=4.194, nll_loss=2.576, ppl=5.96, wps=459780, ups=1.06, wpb=433254, bsz=16845.6, num_updates=54600, lr=0.000270666, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=53555 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 739 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459337, ups=1.06, wpb=432983, bsz=16427.4, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=53649 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 839 / 1689 loss=4.193, nll_loss=2.576, ppl=5.96, wps=462192, ups=1.06, wpb=434313, bsz=16718.2, num_updates=54800, lr=0.000270172, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=53743 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 939 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=458891, ups=1.06, wpb=433370, bsz=15975, num_updates=54900, lr=0.000269925, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=53837 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 epoch 033: 1040 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=450950, ups=1.04, wpb=433088, bsz=16552.6, num_updates=55000, lr=0.00026968, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=95, gb_free=21.1, wall=53933 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.235 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1140 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=411187, ups=0.95, wpb=434388, bsz=16480.6, num_updates=55100, lr=0.000269435, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54039 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1240 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=457731, ups=1.05, wpb=434065, bsz=17246.1, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=18, wall=54134 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1340 / 1689 loss=4.2, nll_loss=2.584, ppl=5.99, wps=460673, ups=1.07, wpb=432338, bsz=16330.4, num_updates=55300, lr=0.000268947, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=54228 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1440 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=458854, ups=1.06, wpb=433479, bsz=16794.8, num_updates=55400, lr=0.000268705, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=54322 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1540 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=458195, ups=1.05, wpb=434682, bsz=16686.3, num_updates=55500, lr=0.000268462, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=54417 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 epoch 033: 1640 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=460368, ups=1.06, wpb=435346, bsz=16035.9, num_updates=55600, lr=0.000268221, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=54512 end of epoch 33 (average epoch stats below) epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 epoch 033 | loss 4.191 | nll_loss 2.573 | ppl 5.95 | wps 452342 | ups 1.04 | wpb 433529 | bsz 16502.8 | num_updates 55649 | lr 0.000268103 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1571 | gb_free 21.5 | wall 54557 Start iterating over samples epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 51 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455293, ups=1.06, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=54606 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 151 / 1689 loss=4.181, nll_loss=2.562, ppl=5.91, wps=459174, ups=1.06, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=54701 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 251 / 1689 loss=4.174, nll_loss=2.554, ppl=5.87, wps=465252, ups=1.08, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=16.1, wall=54794 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 epoch 034: 351 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=455926, ups=1.05, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=54889 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.227 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.227 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 451 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=383879, ups=0.89, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=55002 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 552 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=455941, ups=1.05, wpb=433224, bsz=16526.2, num_updates=56200, lr=0.000266785, gnorm=0.221, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=55097 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 652 / 1689 loss=4.192, nll_loss=2.575, ppl=5.96, wps=460414, ups=1.06, wpb=433399, bsz=16700.3, num_updates=56300, lr=0.000266548, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=55191 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 752 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=457278, ups=1.06, wpb=433341, bsz=16689.1, num_updates=56400, lr=0.000266312, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=55286 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 852 / 1689 loss=4.188, nll_loss=2.57, ppl=5.94, wps=461795, ups=1.06, wpb=433931, bsz=16428.5, num_updates=56500, lr=0.000266076, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=55380 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 952 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=462808, ups=1.07, wpb=433324, bsz=16546.3, num_updates=56600, lr=0.000265841, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=55473 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1053 / 1689 loss=4.18, nll_loss=2.561, ppl=5.9, wps=453039, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.209, clip=0, loss_scale=1, train_wall=94, gb_free=18.5, wall=55569 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1153 / 1689 loss=4.187, nll_loss=2.569, ppl=5.93, wps=457668, ups=1.06, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=55663 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1253 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=459108, ups=1.06, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55758 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 epoch 034: 1353 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=456848, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=55853 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034 | valid on 'valid' subset | loss 4.234 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.227 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1453 / 1689 loss=4.195, nll_loss=2.579, ppl=5.98, wps=401556, ups=0.92, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=55961 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1553 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=462316, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.228, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=56056 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 epoch 034: 1654 / 1689 loss=4.198, nll_loss=2.582, ppl=5.99, wps=454810, ups=1.04, wpb=435341, bsz=16284.2, num_updates=57300, lr=0.000264212, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=56151 end of epoch 34 (average epoch stats below) epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 epoch 034 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449484 | ups 1.04 | wpb 433528 | bsz 16502.3 | num_updates 57335 | lr 0.000264131 | gnorm 0.218 | clip 0 | loss_scale 1 | train_wall 1567 | gb_free 20.1 | wall 56183 Start iterating over samples epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 65 / 1689 loss=4.183, nll_loss=2.564, ppl=5.91, wps=455534, ups=1.06, wpb=430666, bsz=16368, num_updates=57400, lr=0.000263982, gnorm=0.23, clip=0, loss_scale=1, train_wall=91, gb_free=20.1, wall=56246 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 165 / 1689 loss=4.178, nll_loss=2.558, ppl=5.89, wps=457598, ups=1.06, wpb=432580, bsz=16415.4, num_updates=57500, lr=0.000263752, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=56340 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 265 / 1689 loss=4.174, nll_loss=2.555, ppl=5.88, wps=459786, ups=1.06, wpb=434017, bsz=16532, num_updates=57600, lr=0.000263523, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=56435 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 365 / 1689 loss=4.172, nll_loss=2.552, ppl=5.86, wps=454951, ups=1.05, wpb=434247, bsz=16603.9, num_updates=57700, lr=0.000263295, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=56530 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 465 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=461758, ups=1.06, wpb=435003, bsz=16453.5, num_updates=57800, lr=0.000263067, gnorm=0.213, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56624 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 565 / 1689 loss=4.175, nll_loss=2.555, ppl=5.88, wps=457928, ups=1.05, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.214, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=56719 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 epoch 035: 665 / 1689 loss=4.185, nll_loss=2.567, ppl=5.92, wps=458077, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=56814 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.231 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.227 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 766 / 1689 loss=4.188, nll_loss=2.571, ppl=5.94, wps=407410, ups=0.93, wpb=436168, bsz=16822.2, num_updates=58100, lr=0.000262387, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=56921 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 866 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=464111, ups=1.07, wpb=431742, bsz=16687.8, num_updates=58200, lr=0.000262161, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=57014 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 966 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=463198, ups=1.07, wpb=433067, bsz=16569.4, num_updates=58300, lr=0.000261936, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=18, wall=57108 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1066 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=465216, ups=1.07, wpb=434897, bsz=16263.4, num_updates=58400, lr=0.000261712, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57201 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1166 / 1689 loss=4.195, nll_loss=2.579, ppl=5.97, wps=464278, ups=1.07, wpb=432368, bsz=16292.7, num_updates=58500, lr=0.000261488, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57294 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1266 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=458455, ups=1.06, wpb=430784, bsz=16241.4, num_updates=58600, lr=0.000261265, gnorm=0.217, clip=0, loss_scale=2, train_wall=93, gb_free=18.3, wall=57388 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1367 / 1689 loss=4.181, nll_loss=2.563, ppl=5.91, wps=454700, ups=1.05, wpb=433105, bsz=16363.9, num_updates=58700, lr=0.000261042, gnorm=0.217, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=57484 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1467 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=459900, ups=1.06, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57578 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1567 / 1689 loss=4.187, nll_loss=2.569, ppl=5.94, wps=460280, ups=1.06, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=57673 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 epoch 035: 1667 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=457282, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=57767 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.227 end of epoch 35 (average epoch stats below) epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 epoch 035 | loss 4.184 | nll_loss 2.566 | ppl 5.92 | wps 434795 | ups 1 | wpb 433528 | bsz 16502.5 | num_updates 59022 | lr 0.000260329 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 21.6 | wall 57866 Start iterating over samples epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 78 / 1689 loss=4.176, nll_loss=2.557, ppl=5.89, wps=222784, ups=0.52, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.217, clip=0, loss_scale=1, train_wall=122, gb_free=18.9, wall=57961 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 178 / 1689 loss=4.168, nll_loss=2.547, ppl=5.85, wps=465853, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.218, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=58054 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 279 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=458596, ups=1.06, wpb=432476, bsz=16268, num_updates=59300, lr=0.000259718, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=18.3, wall=58148 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 379 / 1689 loss=4.171, nll_loss=2.551, ppl=5.86, wps=460827, ups=1.06, wpb=433188, bsz=16593.7, num_updates=59400, lr=0.0002595, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=16.9, wall=58242 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 479 / 1689 loss=4.175, nll_loss=2.556, ppl=5.88, wps=459761, ups=1.06, wpb=432250, bsz=16647.6, num_updates=59500, lr=0.000259281, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=58336 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 579 / 1689 loss=4.17, nll_loss=2.55, ppl=5.86, wps=458954, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=58430 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 679 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=461527, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=58524 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 779 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=461153, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=58618 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 879 / 1689 loss=4.176, nll_loss=2.557, ppl=5.88, wps=461575, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=58712 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 epoch 036: 979 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461493, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.211, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=58806 Stopping training due to num_updates: 60000 >= max_update: 60000 begin validation on "valid" subset epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 epoch 036 | valid on 'valid' subset | loss 4.232 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.227 end of epoch 36 (average epoch stats below) epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 epoch 036 | loss 4.174 | nll_loss 2.555 | ppl 5.87 | wps 444313 | ups 1.03 | wpb 433224 | bsz 16463.1 | num_updates 60000 | lr 0.000258199 | gnorm 0.215 | clip 0 | loss_scale 2 | train_wall 919 | gb_free 18.9 | wall 58819 done training in 58806.8 seconds