{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': 'simple', 'log_file': 'chkpt/en-ja.do03.ado00/train.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': 'wmt23', 'azureml_logging': False, 'seed': 0, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 8, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 8, 'distributed_num_procs': 8, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': 'tcp://localhost:10676', 'distributed_port': 10676, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 8, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 16384, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 100000, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 16384, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 60000, 'stop_time_hours': 0.0, 'clip_norm': 1.0, 'sentence_avg': False, 'update_freq': [4], 'lr': [0.001], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False, 'debug_param_names': False}, 'checkpoint': {'_name': None, 'save_dir': 'chkpt/en-ja.do03.ado00', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 100000, 'save_interval_updates': 1000, 'keep_interval_updates': 10, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 8}, 'generation': {'_name': None, 'beam': 5, 'beam_mt': 0, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'max_len_a_mt': 0.0, 'max_len_b_mt': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'lenpen_mt': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(no_progress_bar=False, log_interval=100, log_format='simple', log_file='chkpt/en-ja.do03.ado00/train.log', aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project='wmt23', azureml_logging=False, seed=0, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=True, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=8, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='label_smoothed_cross_entropy', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='translation', num_workers=0, skip_invalid_size_inputs_valid_test=False, max_tokens=16384, batch_size=None, required_batch_size_multiple=8, required_seq_len_multiple=1, dataset_impl=None, data_buffer_size=10, train_subset='train', valid_subset='valid', combine_valid_subsets=None, ignore_unused_valid_subsets=False, validate_interval=100000, validate_interval_updates=0, validate_after_updates=0, fixed_validation_seed=None, disable_validation=False, max_tokens_valid=16384, batch_size_valid=None, max_valid_steps=None, curriculum=0, gen_subset='test', num_shards=1, shard_id=0, grouped_shuffling=False, update_epoch_batch_itr=False, update_ordered_indices_seed=False, distributed_world_size=8, distributed_num_procs=8, distributed_rank=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, device_id=0, distributed_no_spawn=False, ddp_backend='pytorch_ddp', ddp_comm_hook='none', bucket_cap_mb=25, fix_batches_to_gpus=False, find_unused_parameters=False, gradient_as_bucket_view=False, fast_stat_sync=False, heartbeat_timeout=-1, broadcast_buffers=False, slowmo_momentum=None, slowmo_base_algorithm='localsgd', localsgd_frequency=3, nprocs_per_node=8, pipeline_model_parallel=False, pipeline_balance=None, pipeline_devices=None, pipeline_chunks=0, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_checkpoint='never', zero_sharding='none', no_reshard_after_forward=False, fp32_reduce_scatter=False, cpu_offload=False, use_sharded_state=False, not_fsdp_flatten_parameters=False, arch='transformer_vaswani_wmt_en_de_big', max_epoch=0, max_update=60000, stop_time_hours=0, clip_norm=1.0, sentence_avg=False, update_freq=[4], lr=[0.001], stop_min_lr=-1.0, use_bmuf=False, skip_remainder_batch=False, debug_param_names=False, save_dir='chkpt/en-ja.do03.ado00', restore_file='checkpoint_last.pt', continue_once=None, finetune_from_model=None, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, optimizer_overrides='{}', save_interval=100000, save_interval_updates=1000, keep_interval_updates=10, keep_interval_updates_pattern=-1, keep_last_epochs=-1, keep_best_checkpoints=-1, no_save=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_save_optimizer_state=False, best_checkpoint_metric='loss', maximize_best_checkpoint_metric=False, patience=-1, checkpoint_suffix='', checkpoint_shard_count=1, load_checkpoint_on_all_dp_ranks=False, write_checkpoints_asynchronously=False, store_ema=False, ema_decay=0.9999, ema_start_update=0, ema_seed_model=None, ema_update_freq=1, ema_fp32=False, data='binarized/en-ja/', source_lang=None, target_lang=None, load_alignments=False, left_pad_source=True, left_pad_target=False, upsample_primary=-1, truncate_source=False, num_batch_buckets=0, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_tokenized_bleu=False, eval_bleu_remove_bpe=None, eval_bleu_print_samples=False, label_smoothing=0.1, report_accuracy=False, ignore_prefix_size=0, adam_betas='(0.9, 0.98)', adam_eps=1e-08, weight_decay=0.0, use_old_adam=False, fp16_adam_stats=False, warmup_updates=4000, warmup_init_lr=-1, pad=1, eos=2, unk=3, encoder_ffn_embed_dim=8192, decoder_ffn_embed_dim=8192, dropout=0.3, attention_dropout=0.0, share_decoder_input_output_embed=True, no_seed_provided=False, encoder_embed_dim=1024, encoder_attention_heads=16, encoder_normalize_before=False, decoder_embed_dim=1024, decoder_attention_heads=16, encoder_embed_path=None, encoder_layers=6, encoder_learned_pos=False, decoder_embed_path=None, decoder_layers=6, decoder_normalize_before=False, decoder_learned_pos=False, activation_dropout=0.0, activation_fn='relu', adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, share_all_embeddings=False, merge_src_tgt_embed=False, no_token_positional_embeddings=False, adaptive_input=False, no_cross_attention=False, cross_self_attention=False, decoder_output_dim=1024, decoder_input_dim=1024, no_scale_embedding=False, layernorm_embedding=False, tie_adaptive_weights=False, checkpoint_activations=False, offload_activations=False, encoder_layers_to_keep=None, decoder_layers_to_keep=None, encoder_layerdrop=0, decoder_layerdrop=0, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, _name='transformer_vaswani_wmt_en_de_big'), 'task': {'_name': 'translation', 'data': 'binarized/en-ja/', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'label_smoothed_cross_entropy', 'label_smoothing': 0.1, 'report_accuracy': False, 'ignore_prefix_size': 0, 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9, 0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.001]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.001]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}} TransformerModel( (encoder): TransformerEncoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(16000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerEncoderLayerBase( (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (dropout_module): FairseqDropout() (activation_dropout_module): FairseqDropout() (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) ) (decoder): TransformerDecoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(32000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerDecoderLayerBase( (dropout_module): FairseqDropout() (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (activation_dropout_module): FairseqDropout() (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (encoder_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (encoder_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) (output_projection): Linear(in_features=1024, out_features=32000, bias=False) ) ) task: TranslationTask model: TransformerModel criterion: LabelSmoothedCrossEntropyCriterion num. shared model params: 326,221,824 (num. trained: 326,221,824) num. expert model params: 0 (num. trained: 0) training on 8 devices (GPUs/TPUs) max tokens per device = 16384 and max sentences per device = None begin dry-run validation on "valid" subset Start iterating over samples epoch 001: 101 / 1689 loss=13.132, nll_loss=12.826, ppl=7261.28, wps=457377, ups=1.05, wpb=435063, bsz=16693.8, num_updates=100, lr=2.5e-05, gnorm=2.894, clip=86, loss_scale=4, train_wall=100, gb_free=19, wall=117 epoch 001: 202 / 1689 loss=11.765, nll_loss=11.274, ppl=2477.19, wps=457089, ups=1.05, wpb=435246, bsz=16971.5, num_updates=200, lr=5e-05, gnorm=2.076, clip=96, loss_scale=2, train_wall=95, gb_free=19.7, wall=212 epoch 001: 302 / 1689 loss=11.268, nll_loss=10.686, ppl=1646.99, wps=459696, ups=1.06, wpb=434886, bsz=16709.5, num_updates=300, lr=7.5e-05, gnorm=1.674, clip=93, loss_scale=2, train_wall=94, gb_free=18.9, wall=307 epoch 001: 402 / 1689 loss=10.652, nll_loss=9.952, ppl=990.63, wps=461480, ups=1.07, wpb=432640, bsz=16420.4, num_updates=400, lr=0.0001, gnorm=1.51, clip=95, loss_scale=2, train_wall=93, gb_free=16.6, wall=400 epoch 001: 502 / 1689 loss=10.105, nll_loss=9.297, ppl=629.11, wps=459724, ups=1.06, wpb=433247, bsz=16495.6, num_updates=500, lr=0.000125, gnorm=1.391, clip=94, loss_scale=2, train_wall=94, gb_free=19.3, wall=495 epoch 001: 602 / 1689 loss=9.685, nll_loss=8.796, ppl=444.53, wps=457423, ups=1.06, wpb=433091, bsz=16520.5, num_updates=600, lr=0.00015, gnorm=1.3, clip=91, loss_scale=2, train_wall=94, gb_free=18.7, wall=589 epoch 001: 702 / 1689 loss=9.336, nll_loss=8.381, ppl=333.41, wps=460279, ups=1.06, wpb=434503, bsz=16405.3, num_updates=700, lr=0.000175, gnorm=1.193, clip=81, loss_scale=4, train_wall=93, gb_free=19.4, wall=684 epoch 001: 802 / 1689 loss=9.005, nll_loss=7.993, ppl=254.75, wps=459259, ups=1.06, wpb=434941, bsz=16333.4, num_updates=800, lr=0.0002, gnorm=1.052, clip=62, loss_scale=4, train_wall=93, gb_free=21.3, wall=778 epoch 001: 902 / 1689 loss=8.673, nll_loss=7.607, ppl=194.99, wps=457520, ups=1.05, wpb=434996, bsz=16571.2, num_updates=900, lr=0.000225, gnorm=1.016, clip=52, loss_scale=4, train_wall=93, gb_free=18.8, wall=874 epoch 001: 1002 / 1689 loss=8.367, nll_loss=7.251, ppl=152.34, wps=450968, ups=1.04, wpb=431938, bsz=16755.5, num_updates=1000, lr=0.00025, gnorm=0.942, clip=34, loss_scale=4, train_wall=94, gb_free=19.7, wall=969 begin validation on "valid" subset epoch 001 | valid on 'valid' subset | loss 8.092 | nll_loss 6.874 | ppl 117.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 1000 epoch 001: 1102 / 1689 loss=8.051, nll_loss=6.887, ppl=118.34, wps=388723, ups=0.9, wpb=430204, bsz=16536.8, num_updates=1100, lr=0.000275, gnorm=0.887, clip=27, loss_scale=4, train_wall=92, gb_free=18.6, wall=1080 epoch 001: 1202 / 1689 loss=7.756, nll_loss=6.546, ppl=93.43, wps=460388, ups=1.06, wpb=433198, bsz=16274.6, num_updates=1200, lr=0.0003, gnorm=0.88, clip=19, loss_scale=8, train_wall=93, gb_free=20.5, wall=1174 epoch 001: 1302 / 1689 loss=7.399, nll_loss=6.135, ppl=70.26, wps=457880, ups=1.06, wpb=432602, bsz=16372.2, num_updates=1300, lr=0.000325, gnorm=0.851, clip=8, loss_scale=8, train_wall=93, gb_free=20.6, wall=1269 epoch 001: 1402 / 1689 loss=7.076, nll_loss=5.763, ppl=54.3, wps=459950, ups=1.06, wpb=434746, bsz=16342.9, num_updates=1400, lr=0.00035, gnorm=0.82, clip=12, loss_scale=8, train_wall=93, gb_free=19.1, wall=1363 epoch 001: 1502 / 1689 loss=6.753, nll_loss=5.393, ppl=42.01, wps=465420, ups=1.07, wpb=435243, bsz=16512.3, num_updates=1500, lr=0.000375, gnorm=0.789, clip=10, loss_scale=8, train_wall=92, gb_free=18.6, wall=1457 epoch 001: 1602 / 1689 loss=6.483, nll_loss=5.085, ppl=33.95, wps=462396, ups=1.07, wpb=433941, bsz=16442.2, num_updates=1600, lr=0.0004, gnorm=0.71, clip=3, loss_scale=8, train_wall=93, gb_free=19.2, wall=1550 end of epoch 1 (average epoch stats below) epoch 001 | loss 8.953 | nll_loss 7.954 | ppl 247.98 | wps 454038 | ups 1.05 | wpb 433517 | bsz 16506.8 | num_updates 1686 | lr 0.0004215 | gnorm 1.22 | clip 51.3 | loss_scale 8 | train_wall 1579 | gb_free 21.1 | wall 1632 Start iterating over samples epoch 002: 14 / 1689 loss=6.273, nll_loss=4.849, ppl=28.83, wps=450561, ups=1.05, wpb=428985, bsz=16298.6, num_updates=1700, lr=0.000425, gnorm=0.701, clip=5, loss_scale=8, train_wall=93, gb_free=20, wall=1646 epoch 002: 14 / 1689 loss=6.273, nll_loss=4.849, ppl=28.83, wps=450561, ups=1.05, wpb=428985, bsz=16298.6, num_updates=1700, lr=0.000425, gnorm=0.701, clip=5, loss_scale=8, train_wall=93, gb_free=20, wall=1646 epoch 002: 114 / 1689 loss=6.084, nll_loss=4.636, ppl=24.87, wps=459492, ups=1.06, wpb=433815, bsz=16765.6, num_updates=1800, lr=0.00045, gnorm=0.599, clip=4, loss_scale=8, train_wall=93, gb_free=19.9, wall=1740 epoch 002: 114 / 1689 loss=6.084, nll_loss=4.636, ppl=24.87, wps=459492, ups=1.06, wpb=433815, bsz=16765.6, num_updates=1800, lr=0.00045, gnorm=0.599, clip=4, loss_scale=8, train_wall=93, gb_free=19.9, wall=1740 epoch 002: 214 / 1689 loss=5.952, nll_loss=4.49, ppl=22.48, wps=458952, ups=1.06, wpb=434234, bsz=16501.8, num_updates=1900, lr=0.000475, gnorm=0.593, clip=2, loss_scale=8, train_wall=93, gb_free=19.6, wall=1835 epoch 002: 214 / 1689 loss=5.952, nll_loss=4.49, ppl=22.48, wps=458952, ups=1.06, wpb=434234, bsz=16501.8, num_updates=1900, lr=0.000475, gnorm=0.593, clip=2, loss_scale=8, train_wall=93, gb_free=19.6, wall=1835 epoch 002: 314 / 1689 loss=5.801, nll_loss=4.323, ppl=20.01, wps=461775, ups=1.06, wpb=434872, bsz=16790, num_updates=2000, lr=0.0005, gnorm=0.553, clip=0, loss_scale=8, train_wall=92, gb_free=19.6, wall=1929 epoch 002: 314 / 1689 loss=5.801, nll_loss=4.323, ppl=20.01, wps=461775, ups=1.06, wpb=434872, bsz=16790, num_updates=2000, lr=0.0005, gnorm=0.553, clip=0, loss_scale=8, train_wall=92, gb_free=19.6, wall=1929 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 5.649 | nll_loss 4.038 | ppl 16.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 2000 | best_loss 5.649 epoch 002 | valid on 'valid' subset | loss 5.649 | nll_loss 4.038 | ppl 16.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 2000 | best_loss 5.649 epoch 002: 414 / 1689 loss=5.706, nll_loss=4.217, ppl=18.6, wps=382537, ups=0.88, wpb=432589, bsz=16224.5, num_updates=2100, lr=0.000525, gnorm=0.53, clip=2, loss_scale=8, train_wall=92, gb_free=20.5, wall=2042 epoch 002: 414 / 1689 loss=5.706, nll_loss=4.217, ppl=18.6, wps=382537, ups=0.88, wpb=432589, bsz=16224.5, num_updates=2100, lr=0.000525, gnorm=0.53, clip=2, loss_scale=8, train_wall=92, gb_free=20.5, wall=2042 epoch 002: 515 / 1689 loss=5.597, nll_loss=4.098, ppl=17.12, wps=457255, ups=1.05, wpb=434843, bsz=16526.6, num_updates=2200, lr=0.00055, gnorm=0.486, clip=0, loss_scale=8, train_wall=94, gb_free=19.5, wall=2137 epoch 002: 515 / 1689 loss=5.597, nll_loss=4.098, ppl=17.12, wps=457255, ups=1.05, wpb=434843, bsz=16526.6, num_updates=2200, lr=0.00055, gnorm=0.486, clip=0, loss_scale=8, train_wall=94, gb_free=19.5, wall=2137 epoch 002: 616 / 1689 loss=5.51, nll_loss=4.004, ppl=16.04, wps=452841, ups=1.05, wpb=432783, bsz=16645.9, num_updates=2300, lr=0.000575, gnorm=0.48, clip=0, loss_scale=4, train_wall=94, gb_free=20, wall=2233 epoch 002: 616 / 1689 loss=5.51, nll_loss=4.004, ppl=16.04, wps=452841, ups=1.05, wpb=432783, bsz=16645.9, num_updates=2300, lr=0.000575, gnorm=0.48, clip=0, loss_scale=4, train_wall=94, gb_free=20, wall=2233 epoch 002: 716 / 1689 loss=5.428, nll_loss=3.913, ppl=15.07, wps=457148, ups=1.05, wpb=434305, bsz=16174.7, num_updates=2400, lr=0.0006, gnorm=0.456, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=2328 epoch 002: 716 / 1689 loss=5.428, nll_loss=3.913, ppl=15.07, wps=457148, ups=1.05, wpb=434305, bsz=16174.7, num_updates=2400, lr=0.0006, gnorm=0.456, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=2328 epoch 002: 816 / 1689 loss=5.36, nll_loss=3.84, ppl=14.32, wps=459476, ups=1.06, wpb=431438, bsz=16804.4, num_updates=2500, lr=0.000625, gnorm=0.451, clip=0, loss_scale=4, train_wall=92, gb_free=19.3, wall=2422 epoch 002: 816 / 1689 loss=5.36, nll_loss=3.84, ppl=14.32, wps=459476, ups=1.06, wpb=431438, bsz=16804.4, num_updates=2500, lr=0.000625, gnorm=0.451, clip=0, loss_scale=4, train_wall=92, gb_free=19.3, wall=2422 epoch 002: 916 / 1689 loss=5.309, nll_loss=3.784, ppl=13.78, wps=460643, ups=1.06, wpb=434226, bsz=16532.2, num_updates=2600, lr=0.00065, gnorm=0.436, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=2516 epoch 002: 916 / 1689 loss=5.309, nll_loss=3.784, ppl=13.78, wps=460643, ups=1.06, wpb=434226, bsz=16532.2, num_updates=2600, lr=0.00065, gnorm=0.436, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=2516 epoch 002: 1016 / 1689 loss=5.248, nll_loss=3.719, ppl=13.17, wps=462214, ups=1.07, wpb=432834, bsz=16389.8, num_updates=2700, lr=0.000675, gnorm=0.456, clip=0, loss_scale=4, train_wall=92, gb_free=20.2, wall=2609 epoch 002: 1016 / 1689 loss=5.248, nll_loss=3.719, ppl=13.17, wps=462214, ups=1.07, wpb=432834, bsz=16389.8, num_updates=2700, lr=0.000675, gnorm=0.456, clip=0, loss_scale=4, train_wall=92, gb_free=20.2, wall=2609 epoch 002: 1116 / 1689 loss=5.207, nll_loss=3.674, ppl=12.76, wps=459171, ups=1.06, wpb=434616, bsz=16381.1, num_updates=2800, lr=0.0007, gnorm=0.407, clip=0, loss_scale=8, train_wall=93, gb_free=21.7, wall=2704 epoch 002: 1116 / 1689 loss=5.207, nll_loss=3.674, ppl=12.76, wps=459171, ups=1.06, wpb=434616, bsz=16381.1, num_updates=2800, lr=0.0007, gnorm=0.407, clip=0, loss_scale=8, train_wall=93, gb_free=21.7, wall=2704 epoch 002: 1217 / 1689 loss=5.159, nll_loss=3.623, ppl=12.32, wps=457283, ups=1.05, wpb=434677, bsz=16534.8, num_updates=2900, lr=0.000725, gnorm=0.429, clip=0, loss_scale=4, train_wall=94, gb_free=19.9, wall=2799 epoch 002: 1217 / 1689 loss=5.159, nll_loss=3.623, ppl=12.32, wps=457283, ups=1.05, wpb=434677, bsz=16534.8, num_updates=2900, lr=0.000725, gnorm=0.429, clip=0, loss_scale=4, train_wall=94, gb_free=19.9, wall=2799 epoch 002: 1317 / 1689 loss=5.124, nll_loss=3.585, ppl=12, wps=458144, ups=1.06, wpb=432763, bsz=16804.3, num_updates=3000, lr=0.00075, gnorm=0.43, clip=0, loss_scale=4, train_wall=93, gb_free=20.1, wall=2894 epoch 002: 1317 / 1689 loss=5.124, nll_loss=3.585, ppl=12, wps=458144, ups=1.06, wpb=432763, bsz=16804.3, num_updates=3000, lr=0.00075, gnorm=0.43, clip=0, loss_scale=4, train_wall=93, gb_free=20.1, wall=2894 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 4.973 | nll_loss 3.326 | ppl 10.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 3000 | best_loss 4.973 epoch 002 | valid on 'valid' subset | loss 4.973 | nll_loss 3.326 | ppl 10.03 | wps 0 | wpb 42662 | bsz 2032 | num_updates 3000 | best_loss 4.973 epoch 002: 1417 / 1689 loss=5.081, nll_loss=3.538, ppl=11.62, wps=372829, ups=0.86, wpb=433750, bsz=16299, num_updates=3100, lr=0.000775, gnorm=0.406, clip=0, loss_scale=4, train_wall=93, gb_free=19.9, wall=3010 epoch 002: 1417 / 1689 loss=5.081, nll_loss=3.538, ppl=11.62, wps=372829, ups=0.86, wpb=433750, bsz=16299, num_updates=3100, lr=0.000775, gnorm=0.406, clip=0, loss_scale=4, train_wall=93, gb_free=19.9, wall=3010 epoch 002: 1517 / 1689 loss=5.063, nll_loss=3.52, ppl=11.47, wps=461357, ups=1.06, wpb=433261, bsz=16440.2, num_updates=3200, lr=0.0008, gnorm=0.418, clip=0, loss_scale=4, train_wall=92, gb_free=19.3, wall=3104 epoch 002: 1517 / 1689 loss=5.063, nll_loss=3.52, ppl=11.47, wps=461357, ups=1.06, wpb=433261, bsz=16440.2, num_updates=3200, lr=0.0008, gnorm=0.418, clip=0, loss_scale=4, train_wall=92, gb_free=19.3, wall=3104 epoch 002: 1617 / 1689 loss=5.019, nll_loss=3.471, ppl=11.09, wps=459523, ups=1.06, wpb=434034, bsz=16356.6, num_updates=3300, lr=0.000825, gnorm=0.428, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=3198 epoch 002: 1617 / 1689 loss=5.019, nll_loss=3.471, ppl=11.09, wps=459523, ups=1.06, wpb=434034, bsz=16356.6, num_updates=3300, lr=0.000825, gnorm=0.428, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=3198 end of epoch 2 (average epoch stats below) epoch 002 | loss 5.405 | nll_loss 3.891 | ppl 14.84 | wps 447264 | ups 1.03 | wpb 433529 | bsz 16505.3 | num_updates 3372 | lr 0.000843 | gnorm 0.473 | clip 0.7 | loss_scale 4 | train_wall 1565 | gb_free 20.5 | wall 3266 epoch 002 | loss 5.405 | nll_loss 3.891 | ppl 14.84 | wps 447264 | ups 1.03 | wpb 433529 | bsz 16505.3 | num_updates 3372 | lr 0.000843 | gnorm 0.473 | clip 0.7 | loss_scale 4 | train_wall 1565 | gb_free 20.5 | wall 3266 Start iterating over samples epoch 003: 28 / 1689 loss=5.012, nll_loss=3.464, ppl=11.04, wps=456392, ups=1.06, wpb=430840, bsz=16401.4, num_updates=3400, lr=0.00085, gnorm=0.433, clip=0, loss_scale=8, train_wall=92, gb_free=18.1, wall=3293 epoch 003: 28 / 1689 loss=5.012, nll_loss=3.464, ppl=11.04, wps=456392, ups=1.06, wpb=430840, bsz=16401.4, num_updates=3400, lr=0.00085, gnorm=0.433, clip=0, loss_scale=8, train_wall=92, gb_free=18.1, wall=3293 epoch 003: 28 / 1689 loss=5.012, nll_loss=3.464, ppl=11.04, wps=456392, ups=1.06, wpb=430840, bsz=16401.4, num_updates=3400, lr=0.00085, gnorm=0.433, clip=0, loss_scale=8, train_wall=92, gb_free=18.1, wall=3293 epoch 003: 129 / 1689 loss=4.967, nll_loss=3.415, ppl=10.67, wps=457064, ups=1.05, wpb=434080, bsz=16765.3, num_updates=3500, lr=0.000875, gnorm=0.401, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=3388 epoch 003: 129 / 1689 loss=4.967, nll_loss=3.415, ppl=10.67, wps=457064, ups=1.05, wpb=434080, bsz=16765.3, num_updates=3500, lr=0.000875, gnorm=0.401, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=3388 epoch 003: 129 / 1689 loss=4.967, nll_loss=3.415, ppl=10.67, wps=457064, ups=1.05, wpb=434080, bsz=16765.3, num_updates=3500, lr=0.000875, gnorm=0.401, clip=0, loss_scale=4, train_wall=93, gb_free=18.4, wall=3388 epoch 003: 229 / 1689 loss=4.953, nll_loss=3.4, ppl=10.56, wps=457248, ups=1.05, wpb=433922, bsz=16906.6, num_updates=3600, lr=0.0009, gnorm=0.435, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=3483 epoch 003: 229 / 1689 loss=4.953, nll_loss=3.4, ppl=10.56, wps=457248, ups=1.05, wpb=433922, bsz=16906.6, num_updates=3600, lr=0.0009, gnorm=0.435, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=3483 epoch 003: 229 / 1689 loss=4.953, nll_loss=3.4, ppl=10.56, wps=457248, ups=1.05, wpb=433922, bsz=16906.6, num_updates=3600, lr=0.0009, gnorm=0.435, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=3483 epoch 003: 329 / 1689 loss=4.926, nll_loss=3.371, ppl=10.34, wps=453768, ups=1.05, wpb=431866, bsz=16563.3, num_updates=3700, lr=0.000925, gnorm=0.42, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=3578 epoch 003: 329 / 1689 loss=4.926, nll_loss=3.371, ppl=10.34, wps=453768, ups=1.05, wpb=431866, bsz=16563.3, num_updates=3700, lr=0.000925, gnorm=0.42, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=3578 epoch 003: 329 / 1689 loss=4.926, nll_loss=3.371, ppl=10.34, wps=453768, ups=1.05, wpb=431866, bsz=16563.3, num_updates=3700, lr=0.000925, gnorm=0.42, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=3578 epoch 003: 430 / 1689 loss=4.925, nll_loss=3.371, ppl=10.34, wps=458494, ups=1.06, wpb=433252, bsz=16402, num_updates=3800, lr=0.00095, gnorm=0.419, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=3672 epoch 003: 430 / 1689 loss=4.925, nll_loss=3.371, ppl=10.34, wps=458494, ups=1.06, wpb=433252, bsz=16402, num_updates=3800, lr=0.00095, gnorm=0.419, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=3672 epoch 003: 430 / 1689 loss=4.925, nll_loss=3.371, ppl=10.34, wps=458494, ups=1.06, wpb=433252, bsz=16402, num_updates=3800, lr=0.00095, gnorm=0.419, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=3672 epoch 003: 530 / 1689 loss=4.912, nll_loss=3.357, ppl=10.25, wps=461139, ups=1.06, wpb=434716, bsz=16543.6, num_updates=3900, lr=0.000975, gnorm=0.427, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=3767 epoch 003: 530 / 1689 loss=4.912, nll_loss=3.357, ppl=10.25, wps=461139, ups=1.06, wpb=434716, bsz=16543.6, num_updates=3900, lr=0.000975, gnorm=0.427, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=3767 epoch 003: 530 / 1689 loss=4.912, nll_loss=3.357, ppl=10.25, wps=461139, ups=1.06, wpb=434716, bsz=16543.6, num_updates=3900, lr=0.000975, gnorm=0.427, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=3767 epoch 003: 630 / 1689 loss=4.899, nll_loss=3.343, ppl=10.15, wps=461623, ups=1.06, wpb=434272, bsz=16474.6, num_updates=4000, lr=0.001, gnorm=0.429, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=3861 epoch 003: 630 / 1689 loss=4.899, nll_loss=3.343, ppl=10.15, wps=461623, ups=1.06, wpb=434272, bsz=16474.6, num_updates=4000, lr=0.001, gnorm=0.429, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=3861 epoch 003: 630 / 1689 loss=4.899, nll_loss=3.343, ppl=10.15, wps=461623, ups=1.06, wpb=434272, bsz=16474.6, num_updates=4000, lr=0.001, gnorm=0.429, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=3861 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.788 | nll_loss 3.149 | ppl 8.87 | wps 0 | wpb 42662 | bsz 2032 | num_updates 4000 | best_loss 4.788 epoch 003 | valid on 'valid' subset | loss 4.788 | nll_loss 3.149 | ppl 8.87 | wps 0 | wpb 42662 | bsz 2032 | num_updates 4000 | best_loss 4.788 epoch 003 | valid on 'valid' subset | loss 4.788 | nll_loss 3.149 | ppl 8.87 | wps 0 | wpb 42662 | bsz 2032 | num_updates 4000 | best_loss 4.788 epoch 003: 730 / 1689 loss=4.897, nll_loss=3.342, ppl=10.14, wps=384154, ups=0.88, wpb=434443, bsz=16383.4, num_updates=4100, lr=0.00098773, gnorm=0.438, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=3974 epoch 003: 730 / 1689 loss=4.897, nll_loss=3.342, ppl=10.14, wps=384154, ups=0.88, wpb=434443, bsz=16383.4, num_updates=4100, lr=0.00098773, gnorm=0.438, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=3974 epoch 003: 730 / 1689 loss=4.897, nll_loss=3.342, ppl=10.14, wps=384154, ups=0.88, wpb=434443, bsz=16383.4, num_updates=4100, lr=0.00098773, gnorm=0.438, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=3974 epoch 003: 830 / 1689 loss=4.863, nll_loss=3.304, ppl=9.88, wps=460034, ups=1.06, wpb=432929, bsz=16190.4, num_updates=4200, lr=0.0009759, gnorm=0.413, clip=0, loss_scale=2, train_wall=92, gb_free=21.7, wall=4068 epoch 003: 830 / 1689 loss=4.863, nll_loss=3.304, ppl=9.88, wps=460034, ups=1.06, wpb=432929, bsz=16190.4, num_updates=4200, lr=0.0009759, gnorm=0.413, clip=0, loss_scale=2, train_wall=92, gb_free=21.7, wall=4068 epoch 003: 830 / 1689 loss=4.863, nll_loss=3.304, ppl=9.88, wps=460034, ups=1.06, wpb=432929, bsz=16190.4, num_updates=4200, lr=0.0009759, gnorm=0.413, clip=0, loss_scale=2, train_wall=92, gb_free=21.7, wall=4068 epoch 003: 930 / 1689 loss=4.848, nll_loss=3.288, ppl=9.77, wps=461028, ups=1.06, wpb=433053, bsz=16397.6, num_updates=4300, lr=0.000964486, gnorm=0.428, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=4162 epoch 003: 930 / 1689 loss=4.848, nll_loss=3.288, ppl=9.77, wps=461028, ups=1.06, wpb=433053, bsz=16397.6, num_updates=4300, lr=0.000964486, gnorm=0.428, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=4162 epoch 003: 930 / 1689 loss=4.848, nll_loss=3.288, ppl=9.77, wps=461028, ups=1.06, wpb=433053, bsz=16397.6, num_updates=4300, lr=0.000964486, gnorm=0.428, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=4162 epoch 003: 1030 / 1689 loss=4.842, nll_loss=3.282, ppl=9.73, wps=462378, ups=1.06, wpb=434331, bsz=16308.9, num_updates=4400, lr=0.000953463, gnorm=0.393, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=4256 epoch 003: 1030 / 1689 loss=4.842, nll_loss=3.282, ppl=9.73, wps=462378, ups=1.06, wpb=434331, bsz=16308.9, num_updates=4400, lr=0.000953463, gnorm=0.393, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=4256 epoch 003: 1030 / 1689 loss=4.842, nll_loss=3.282, ppl=9.73, wps=462378, ups=1.06, wpb=434331, bsz=16308.9, num_updates=4400, lr=0.000953463, gnorm=0.393, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=4256 epoch 003: 1130 / 1689 loss=4.828, nll_loss=3.268, ppl=9.63, wps=463878, ups=1.06, wpb=437633, bsz=16972.6, num_updates=4500, lr=0.000942809, gnorm=0.414, clip=0, loss_scale=4, train_wall=92, gb_free=18.9, wall=4350 epoch 003: 1130 / 1689 loss=4.828, nll_loss=3.268, ppl=9.63, wps=463878, ups=1.06, wpb=437633, bsz=16972.6, num_updates=4500, lr=0.000942809, gnorm=0.414, clip=0, loss_scale=4, train_wall=92, gb_free=18.9, wall=4350 epoch 003: 1130 / 1689 loss=4.828, nll_loss=3.268, ppl=9.63, wps=463878, ups=1.06, wpb=437633, bsz=16972.6, num_updates=4500, lr=0.000942809, gnorm=0.414, clip=0, loss_scale=4, train_wall=92, gb_free=18.9, wall=4350 epoch 003: 1230 / 1689 loss=4.808, nll_loss=3.246, ppl=9.48, wps=466649, ups=1.08, wpb=433771, bsz=16424.7, num_updates=4600, lr=0.000932505, gnorm=0.406, clip=0, loss_scale=4, train_wall=92, gb_free=20.6, wall=4443 epoch 003: 1230 / 1689 loss=4.808, nll_loss=3.246, ppl=9.48, wps=466649, ups=1.08, wpb=433771, bsz=16424.7, num_updates=4600, lr=0.000932505, gnorm=0.406, clip=0, loss_scale=4, train_wall=92, gb_free=20.6, wall=4443 epoch 003: 1230 / 1689 loss=4.808, nll_loss=3.246, ppl=9.48, wps=466649, ups=1.08, wpb=433771, bsz=16424.7, num_updates=4600, lr=0.000932505, gnorm=0.406, clip=0, loss_scale=4, train_wall=92, gb_free=20.6, wall=4443 epoch 003: 1330 / 1689 loss=4.783, nll_loss=3.218, ppl=9.3, wps=463449, ups=1.07, wpb=433946, bsz=16368.4, num_updates=4700, lr=0.000922531, gnorm=0.394, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=4537 epoch 003: 1330 / 1689 loss=4.783, nll_loss=3.218, ppl=9.3, wps=463449, ups=1.07, wpb=433946, bsz=16368.4, num_updates=4700, lr=0.000922531, gnorm=0.394, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=4537 epoch 003: 1330 / 1689 loss=4.783, nll_loss=3.218, ppl=9.3, wps=463449, ups=1.07, wpb=433946, bsz=16368.4, num_updates=4700, lr=0.000922531, gnorm=0.394, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=4537 epoch 003: 1430 / 1689 loss=4.784, nll_loss=3.221, ppl=9.33, wps=462874, ups=1.07, wpb=433651, bsz=16602.7, num_updates=4800, lr=0.000912871, gnorm=0.395, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=4630 epoch 003: 1430 / 1689 loss=4.784, nll_loss=3.221, ppl=9.33, wps=462874, ups=1.07, wpb=433651, bsz=16602.7, num_updates=4800, lr=0.000912871, gnorm=0.395, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=4630 epoch 003: 1430 / 1689 loss=4.784, nll_loss=3.221, ppl=9.33, wps=462874, ups=1.07, wpb=433651, bsz=16602.7, num_updates=4800, lr=0.000912871, gnorm=0.395, clip=0, loss_scale=4, train_wall=92, gb_free=18.7, wall=4630 epoch 003: 1531 / 1689 loss=4.755, nll_loss=3.188, ppl=9.11, wps=453376, ups=1.05, wpb=432075, bsz=16461, num_updates=4900, lr=0.000903508, gnorm=0.392, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=4726 epoch 003: 1531 / 1689 loss=4.755, nll_loss=3.188, ppl=9.11, wps=453376, ups=1.05, wpb=432075, bsz=16461, num_updates=4900, lr=0.000903508, gnorm=0.392, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=4726 epoch 003: 1531 / 1689 loss=4.755, nll_loss=3.188, ppl=9.11, wps=453376, ups=1.05, wpb=432075, bsz=16461, num_updates=4900, lr=0.000903508, gnorm=0.392, clip=0, loss_scale=4, train_wall=93, gb_free=19.1, wall=4726 epoch 003: 1632 / 1689 loss=4.747, nll_loss=3.18, ppl=9.06, wps=453944, ups=1.05, wpb=431879, bsz=16155.7, num_updates=5000, lr=0.000894427, gnorm=0.402, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=4821 epoch 003: 1632 / 1689 loss=4.747, nll_loss=3.18, ppl=9.06, wps=453944, ups=1.05, wpb=431879, bsz=16155.7, num_updates=5000, lr=0.000894427, gnorm=0.402, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=4821 epoch 003: 1632 / 1689 loss=4.747, nll_loss=3.18, ppl=9.06, wps=453944, ups=1.05, wpb=431879, bsz=16155.7, num_updates=5000, lr=0.000894427, gnorm=0.402, clip=0, loss_scale=2, train_wall=94, gb_free=20.1, wall=4821 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.623 | nll_loss 2.98 | ppl 7.89 | wps 0 | wpb 42662 | bsz 2032 | num_updates 5000 | best_loss 4.623 epoch 003 | valid on 'valid' subset | loss 4.623 | nll_loss 2.98 | ppl 7.89 | wps 0 | wpb 42662 | bsz 2032 | num_updates 5000 | best_loss 4.623 epoch 003 | valid on 'valid' subset | loss 4.623 | nll_loss 2.98 | ppl 7.89 | wps 0 | wpb 42662 | bsz 2032 | num_updates 5000 | best_loss 4.623 end of epoch 3 (average epoch stats below) epoch 003 | loss 4.857 | nll_loss 3.298 | ppl 9.84 | wps 412144 | ups 0.95 | wpb 433530 | bsz 16498.1 | num_updates 5057 | lr 0.000889372 | gnorm 0.413 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 19.6 | wall 5038 epoch 003 | loss 4.857 | nll_loss 3.298 | ppl 9.84 | wps 412144 | ups 0.95 | wpb 433530 | bsz 16498.1 | num_updates 5057 | lr 0.000889372 | gnorm 0.413 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 19.6 | wall 5038 epoch 003 | loss 4.857 | nll_loss 3.298 | ppl 9.84 | wps 412144 | ups 0.95 | wpb 433530 | bsz 16498.1 | num_updates 5057 | lr 0.000889372 | gnorm 0.413 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 19.6 | wall 5038 Start iterating over samples epoch 004: 43 / 1689 loss=4.724, nll_loss=3.155, ppl=8.91, wps=165637, ups=0.38, wpb=430316, bsz=16812.3, num_updates=5100, lr=0.000885615, gnorm=0.389, clip=0, loss_scale=2, train_wall=148, gb_free=19.3, wall=5081 epoch 004: 43 / 1689 loss=4.724, nll_loss=3.155, ppl=8.91, wps=165637, ups=0.38, wpb=430316, bsz=16812.3, num_updates=5100, lr=0.000885615, gnorm=0.389, clip=0, loss_scale=2, train_wall=148, gb_free=19.3, wall=5081 epoch 004: 43 / 1689 loss=4.724, nll_loss=3.155, ppl=8.91, wps=165637, ups=0.38, wpb=430316, bsz=16812.3, num_updates=5100, lr=0.000885615, gnorm=0.389, clip=0, loss_scale=2, train_wall=148, gb_free=19.3, wall=5081 epoch 004: 43 / 1689 loss=4.724, nll_loss=3.155, ppl=8.91, wps=165637, ups=0.38, wpb=430316, bsz=16812.3, num_updates=5100, lr=0.000885615, gnorm=0.389, clip=0, loss_scale=2, train_wall=148, gb_free=19.3, wall=5081 epoch 004: 143 / 1689 loss=4.702, nll_loss=3.129, ppl=8.75, wps=472650, ups=1.09, wpb=433478, bsz=16365.4, num_updates=5200, lr=0.000877058, gnorm=0.387, clip=0, loss_scale=2, train_wall=91, gb_free=20, wall=5172 epoch 004: 143 / 1689 loss=4.702, nll_loss=3.129, ppl=8.75, wps=472650, ups=1.09, wpb=433478, bsz=16365.4, num_updates=5200, lr=0.000877058, gnorm=0.387, clip=0, loss_scale=2, train_wall=91, gb_free=20, wall=5172 epoch 004: 143 / 1689 loss=4.702, nll_loss=3.129, ppl=8.75, wps=472650, ups=1.09, wpb=433478, bsz=16365.4, num_updates=5200, lr=0.000877058, gnorm=0.387, clip=0, loss_scale=2, train_wall=91, gb_free=20, wall=5172 epoch 004: 143 / 1689 loss=4.702, nll_loss=3.129, ppl=8.75, wps=472650, ups=1.09, wpb=433478, bsz=16365.4, num_updates=5200, lr=0.000877058, gnorm=0.387, clip=0, loss_scale=2, train_wall=91, gb_free=20, wall=5172 epoch 004: 243 / 1689 loss=4.697, nll_loss=3.125, ppl=8.72, wps=461295, ups=1.07, wpb=431922, bsz=16305.1, num_updates=5300, lr=0.000868744, gnorm=0.389, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=5266 epoch 004: 243 / 1689 loss=4.697, nll_loss=3.125, ppl=8.72, wps=461295, ups=1.07, wpb=431922, bsz=16305.1, num_updates=5300, lr=0.000868744, gnorm=0.389, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=5266 epoch 004: 243 / 1689 loss=4.697, nll_loss=3.125, ppl=8.72, wps=461295, ups=1.07, wpb=431922, bsz=16305.1, num_updates=5300, lr=0.000868744, gnorm=0.389, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=5266 epoch 004: 243 / 1689 loss=4.697, nll_loss=3.125, ppl=8.72, wps=461295, ups=1.07, wpb=431922, bsz=16305.1, num_updates=5300, lr=0.000868744, gnorm=0.389, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=5266 epoch 004: 343 / 1689 loss=4.696, nll_loss=3.124, ppl=8.72, wps=465358, ups=1.07, wpb=434595, bsz=16711.5, num_updates=5400, lr=0.000860663, gnorm=0.38, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=5359 epoch 004: 343 / 1689 loss=4.696, nll_loss=3.124, ppl=8.72, wps=465358, ups=1.07, wpb=434595, bsz=16711.5, num_updates=5400, lr=0.000860663, gnorm=0.38, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=5359 epoch 004: 343 / 1689 loss=4.696, nll_loss=3.124, ppl=8.72, wps=465358, ups=1.07, wpb=434595, bsz=16711.5, num_updates=5400, lr=0.000860663, gnorm=0.38, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=5359 epoch 004: 343 / 1689 loss=4.696, nll_loss=3.124, ppl=8.72, wps=465358, ups=1.07, wpb=434595, bsz=16711.5, num_updates=5400, lr=0.000860663, gnorm=0.38, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=5359 epoch 004: 443 / 1689 loss=4.685, nll_loss=3.113, ppl=8.65, wps=466929, ups=1.08, wpb=432435, bsz=16662.2, num_updates=5500, lr=0.000852803, gnorm=0.38, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=5452 epoch 004: 443 / 1689 loss=4.685, nll_loss=3.113, ppl=8.65, wps=466929, ups=1.08, wpb=432435, bsz=16662.2, num_updates=5500, lr=0.000852803, gnorm=0.38, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=5452 epoch 004: 443 / 1689 loss=4.685, nll_loss=3.113, ppl=8.65, wps=466929, ups=1.08, wpb=432435, bsz=16662.2, num_updates=5500, lr=0.000852803, gnorm=0.38, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=5452 epoch 004: 443 / 1689 loss=4.685, nll_loss=3.113, ppl=8.65, wps=466929, ups=1.08, wpb=432435, bsz=16662.2, num_updates=5500, lr=0.000852803, gnorm=0.38, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=5452 epoch 004: 543 / 1689 loss=4.685, nll_loss=3.113, ppl=8.65, wps=467554, ups=1.08, wpb=433622, bsz=16605.2, num_updates=5600, lr=0.000845154, gnorm=0.393, clip=0, loss_scale=4, train_wall=92, gb_free=19.4, wall=5545 epoch 004: 543 / 1689 loss=4.685, nll_loss=3.113, ppl=8.65, wps=467554, ups=1.08, wpb=433622, bsz=16605.2, num_updates=5600, lr=0.000845154, gnorm=0.393, clip=0, loss_scale=4, train_wall=92, gb_free=19.4, wall=5545 epoch 004: 543 / 1689 loss=4.685, nll_loss=3.113, ppl=8.65, wps=467554, ups=1.08, wpb=433622, bsz=16605.2, num_updates=5600, lr=0.000845154, gnorm=0.393, clip=0, loss_scale=4, train_wall=92, gb_free=19.4, wall=5545 epoch 004: 543 / 1689 loss=4.685, nll_loss=3.113, ppl=8.65, wps=467554, ups=1.08, wpb=433622, bsz=16605.2, num_updates=5600, lr=0.000845154, gnorm=0.393, clip=0, loss_scale=4, train_wall=92, gb_free=19.4, wall=5545 epoch 004: 643 / 1689 loss=4.669, nll_loss=3.096, ppl=8.55, wps=462218, ups=1.06, wpb=434504, bsz=16524.6, num_updates=5700, lr=0.000837708, gnorm=0.378, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=5639 epoch 004: 643 / 1689 loss=4.669, nll_loss=3.096, ppl=8.55, wps=462218, ups=1.06, wpb=434504, bsz=16524.6, num_updates=5700, lr=0.000837708, gnorm=0.378, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=5639 epoch 004: 643 / 1689 loss=4.669, nll_loss=3.096, ppl=8.55, wps=462218, ups=1.06, wpb=434504, bsz=16524.6, num_updates=5700, lr=0.000837708, gnorm=0.378, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=5639 epoch 004: 643 / 1689 loss=4.669, nll_loss=3.096, ppl=8.55, wps=462218, ups=1.06, wpb=434504, bsz=16524.6, num_updates=5700, lr=0.000837708, gnorm=0.378, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=5639 epoch 004: 743 / 1689 loss=4.664, nll_loss=3.09, ppl=8.51, wps=463452, ups=1.07, wpb=433836, bsz=16571.8, num_updates=5800, lr=0.000830455, gnorm=0.398, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=5732 epoch 004: 743 / 1689 loss=4.664, nll_loss=3.09, ppl=8.51, wps=463452, ups=1.07, wpb=433836, bsz=16571.8, num_updates=5800, lr=0.000830455, gnorm=0.398, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=5732 epoch 004: 743 / 1689 loss=4.664, nll_loss=3.09, ppl=8.51, wps=463452, ups=1.07, wpb=433836, bsz=16571.8, num_updates=5800, lr=0.000830455, gnorm=0.398, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=5732 epoch 004: 743 / 1689 loss=4.664, nll_loss=3.09, ppl=8.51, wps=463452, ups=1.07, wpb=433836, bsz=16571.8, num_updates=5800, lr=0.000830455, gnorm=0.398, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=5732 epoch 004: 843 / 1689 loss=4.653, nll_loss=3.078, ppl=8.44, wps=463931, ups=1.07, wpb=435037, bsz=16307.4, num_updates=5900, lr=0.000823387, gnorm=0.386, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=5826 epoch 004: 843 / 1689 loss=4.653, nll_loss=3.078, ppl=8.44, wps=463931, ups=1.07, wpb=435037, bsz=16307.4, num_updates=5900, lr=0.000823387, gnorm=0.386, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=5826 epoch 004: 843 / 1689 loss=4.653, nll_loss=3.078, ppl=8.44, wps=463931, ups=1.07, wpb=435037, bsz=16307.4, num_updates=5900, lr=0.000823387, gnorm=0.386, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=5826 epoch 004: 843 / 1689 loss=4.653, nll_loss=3.078, ppl=8.44, wps=463931, ups=1.07, wpb=435037, bsz=16307.4, num_updates=5900, lr=0.000823387, gnorm=0.386, clip=0, loss_scale=4, train_wall=92, gb_free=19, wall=5826 epoch 004: 944 / 1689 loss=4.643, nll_loss=3.067, ppl=8.38, wps=454022, ups=1.05, wpb=433430, bsz=16414.4, num_updates=6000, lr=0.000816497, gnorm=0.391, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=5922 epoch 004: 944 / 1689 loss=4.643, nll_loss=3.067, ppl=8.38, wps=454022, ups=1.05, wpb=433430, bsz=16414.4, num_updates=6000, lr=0.000816497, gnorm=0.391, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=5922 epoch 004: 944 / 1689 loss=4.643, nll_loss=3.067, ppl=8.38, wps=454022, ups=1.05, wpb=433430, bsz=16414.4, num_updates=6000, lr=0.000816497, gnorm=0.391, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=5922 epoch 004: 944 / 1689 loss=4.643, nll_loss=3.067, ppl=8.38, wps=454022, ups=1.05, wpb=433430, bsz=16414.4, num_updates=6000, lr=0.000816497, gnorm=0.391, clip=0, loss_scale=2, train_wall=94, gb_free=19, wall=5922 begin validation on "valid" subset epoch 004 | valid on 'valid' subset | loss 4.564 | nll_loss 2.917 | ppl 7.55 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.564 epoch 004 | valid on 'valid' subset | loss 4.564 | nll_loss 2.917 | ppl 7.55 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.564 epoch 004 | valid on 'valid' subset | loss 4.564 | nll_loss 2.917 | ppl 7.55 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.564 epoch 004 | valid on 'valid' subset | loss 4.564 | nll_loss 2.917 | ppl 7.55 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.564 epoch 004: 1044 / 1689 loss=4.645, nll_loss=3.07, ppl=8.4, wps=375097, ups=0.86, wpb=434073, bsz=16698.2, num_updates=6100, lr=0.000809776, gnorm=0.371, clip=0, loss_scale=2, train_wall=95, gb_free=19.5, wall=6037 epoch 004: 1044 / 1689 loss=4.645, nll_loss=3.07, ppl=8.4, wps=375097, ups=0.86, wpb=434073, bsz=16698.2, num_updates=6100, lr=0.000809776, gnorm=0.371, clip=0, loss_scale=2, train_wall=95, gb_free=19.5, wall=6037 epoch 004: 1044 / 1689 loss=4.645, nll_loss=3.07, ppl=8.4, wps=375097, ups=0.86, wpb=434073, bsz=16698.2, num_updates=6100, lr=0.000809776, gnorm=0.371, clip=0, loss_scale=2, train_wall=95, gb_free=19.5, wall=6037 epoch 004: 1044 / 1689 loss=4.645, nll_loss=3.07, ppl=8.4, wps=375097, ups=0.86, wpb=434073, bsz=16698.2, num_updates=6100, lr=0.000809776, gnorm=0.371, clip=0, loss_scale=2, train_wall=95, gb_free=19.5, wall=6037 epoch 004: 1144 / 1689 loss=4.636, nll_loss=3.06, ppl=8.34, wps=462018, ups=1.06, wpb=434581, bsz=16651.3, num_updates=6200, lr=0.000803219, gnorm=0.391, clip=0, loss_scale=2, train_wall=92, gb_free=20.6, wall=6131 epoch 004: 1144 / 1689 loss=4.636, nll_loss=3.06, ppl=8.34, wps=462018, ups=1.06, wpb=434581, bsz=16651.3, num_updates=6200, lr=0.000803219, gnorm=0.391, clip=0, loss_scale=2, train_wall=92, gb_free=20.6, wall=6131 epoch 004: 1144 / 1689 loss=4.636, nll_loss=3.06, ppl=8.34, wps=462018, ups=1.06, wpb=434581, bsz=16651.3, num_updates=6200, lr=0.000803219, gnorm=0.391, clip=0, loss_scale=2, train_wall=92, gb_free=20.6, wall=6131 epoch 004: 1144 / 1689 loss=4.636, nll_loss=3.06, ppl=8.34, wps=462018, ups=1.06, wpb=434581, bsz=16651.3, num_updates=6200, lr=0.000803219, gnorm=0.391, clip=0, loss_scale=2, train_wall=92, gb_free=20.6, wall=6131 epoch 004: 1244 / 1689 loss=4.629, nll_loss=3.053, ppl=8.3, wps=461015, ups=1.06, wpb=434843, bsz=16323.7, num_updates=6300, lr=0.000796819, gnorm=0.378, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=6226 epoch 004: 1244 / 1689 loss=4.629, nll_loss=3.053, ppl=8.3, wps=461015, ups=1.06, wpb=434843, bsz=16323.7, num_updates=6300, lr=0.000796819, gnorm=0.378, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=6226 epoch 004: 1244 / 1689 loss=4.629, nll_loss=3.053, ppl=8.3, wps=461015, ups=1.06, wpb=434843, bsz=16323.7, num_updates=6300, lr=0.000796819, gnorm=0.378, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=6226 epoch 004: 1244 / 1689 loss=4.629, nll_loss=3.053, ppl=8.3, wps=461015, ups=1.06, wpb=434843, bsz=16323.7, num_updates=6300, lr=0.000796819, gnorm=0.378, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=6226 epoch 004: 1344 / 1689 loss=4.622, nll_loss=3.046, ppl=8.26, wps=463126, ups=1.07, wpb=432538, bsz=16691.5, num_updates=6400, lr=0.000790569, gnorm=0.386, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=6319 epoch 004: 1344 / 1689 loss=4.622, nll_loss=3.046, ppl=8.26, wps=463126, ups=1.07, wpb=432538, bsz=16691.5, num_updates=6400, lr=0.000790569, gnorm=0.386, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=6319 epoch 004: 1344 / 1689 loss=4.622, nll_loss=3.046, ppl=8.26, wps=463126, ups=1.07, wpb=432538, bsz=16691.5, num_updates=6400, lr=0.000790569, gnorm=0.386, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=6319 epoch 004: 1344 / 1689 loss=4.622, nll_loss=3.046, ppl=8.26, wps=463126, ups=1.07, wpb=432538, bsz=16691.5, num_updates=6400, lr=0.000790569, gnorm=0.386, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=6319 epoch 004: 1444 / 1689 loss=4.615, nll_loss=3.038, ppl=8.21, wps=462544, ups=1.06, wpb=434479, bsz=16687.9, num_updates=6500, lr=0.000784465, gnorm=0.389, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=6413 epoch 004: 1444 / 1689 loss=4.615, nll_loss=3.038, ppl=8.21, wps=462544, ups=1.06, wpb=434479, bsz=16687.9, num_updates=6500, lr=0.000784465, gnorm=0.389, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=6413 epoch 004: 1444 / 1689 loss=4.615, nll_loss=3.038, ppl=8.21, wps=462544, ups=1.06, wpb=434479, bsz=16687.9, num_updates=6500, lr=0.000784465, gnorm=0.389, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=6413 epoch 004: 1444 / 1689 loss=4.615, nll_loss=3.038, ppl=8.21, wps=462544, ups=1.06, wpb=434479, bsz=16687.9, num_updates=6500, lr=0.000784465, gnorm=0.389, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=6413 epoch 004: 1545 / 1689 loss=4.614, nll_loss=3.036, ppl=8.2, wps=460123, ups=1.06, wpb=434254, bsz=16269.4, num_updates=6600, lr=0.000778499, gnorm=0.375, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=6507 epoch 004: 1545 / 1689 loss=4.614, nll_loss=3.036, ppl=8.2, wps=460123, ups=1.06, wpb=434254, bsz=16269.4, num_updates=6600, lr=0.000778499, gnorm=0.375, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=6507 epoch 004: 1545 / 1689 loss=4.614, nll_loss=3.036, ppl=8.2, wps=460123, ups=1.06, wpb=434254, bsz=16269.4, num_updates=6600, lr=0.000778499, gnorm=0.375, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=6507 epoch 004: 1545 / 1689 loss=4.614, nll_loss=3.036, ppl=8.2, wps=460123, ups=1.06, wpb=434254, bsz=16269.4, num_updates=6600, lr=0.000778499, gnorm=0.375, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=6507 epoch 004: 1645 / 1689 loss=4.615, nll_loss=3.038, ppl=8.21, wps=465522, ups=1.08, wpb=432726, bsz=16192.9, num_updates=6700, lr=0.000772667, gnorm=0.387, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=6600 epoch 004: 1645 / 1689 loss=4.615, nll_loss=3.038, ppl=8.21, wps=465522, ups=1.08, wpb=432726, bsz=16192.9, num_updates=6700, lr=0.000772667, gnorm=0.387, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=6600 epoch 004: 1645 / 1689 loss=4.615, nll_loss=3.038, ppl=8.21, wps=465522, ups=1.08, wpb=432726, bsz=16192.9, num_updates=6700, lr=0.000772667, gnorm=0.387, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=6600 epoch 004: 1645 / 1689 loss=4.615, nll_loss=3.038, ppl=8.21, wps=465522, ups=1.08, wpb=432726, bsz=16192.9, num_updates=6700, lr=0.000772667, gnorm=0.387, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=6600 end of epoch 4 (average epoch stats below) epoch 004 | loss 4.654 | nll_loss 3.079 | ppl 8.45 | wps 456426 | ups 1.05 | wpb 433512 | bsz 16505.4 | num_updates 6744 | lr 0.000770143 | gnorm 0.385 | clip 0 | loss_scale 2 | train_wall 1560 | gb_free 19.6 | wall 6641 epoch 004 | loss 4.654 | nll_loss 3.079 | ppl 8.45 | wps 456426 | ups 1.05 | wpb 433512 | bsz 16505.4 | num_updates 6744 | lr 0.000770143 | gnorm 0.385 | clip 0 | loss_scale 2 | train_wall 1560 | gb_free 19.6 | wall 6641 epoch 004 | loss 4.654 | nll_loss 3.079 | ppl 8.45 | wps 456426 | ups 1.05 | wpb 433512 | bsz 16505.4 | num_updates 6744 | lr 0.000770143 | gnorm 0.385 | clip 0 | loss_scale 2 | train_wall 1560 | gb_free 19.6 | wall 6641 epoch 004 | loss 4.654 | nll_loss 3.079 | ppl 8.45 | wps 456426 | ups 1.05 | wpb 433512 | bsz 16505.4 | num_updates 6744 | lr 0.000770143 | gnorm 0.385 | clip 0 | loss_scale 2 | train_wall 1560 | gb_free 19.6 | wall 6641 Start iterating over samples epoch 005: 56 / 1689 loss=4.579, nll_loss=2.996, ppl=7.98, wps=461643, ups=1.08, wpb=429081, bsz=16287.7, num_updates=6800, lr=0.000766965, gnorm=0.366, clip=0, loss_scale=2, train_wall=91, gb_free=19.9, wall=6693 epoch 005: 56 / 1689 loss=4.579, nll_loss=2.996, ppl=7.98, wps=461643, ups=1.08, wpb=429081, bsz=16287.7, num_updates=6800, lr=0.000766965, gnorm=0.366, clip=0, loss_scale=2, train_wall=91, gb_free=19.9, wall=6693 epoch 005: 56 / 1689 loss=4.579, nll_loss=2.996, ppl=7.98, wps=461643, ups=1.08, wpb=429081, bsz=16287.7, num_updates=6800, lr=0.000766965, gnorm=0.366, clip=0, loss_scale=2, train_wall=91, gb_free=19.9, wall=6693 epoch 005: 56 / 1689 loss=4.579, nll_loss=2.996, ppl=7.98, wps=461643, ups=1.08, wpb=429081, bsz=16287.7, num_updates=6800, lr=0.000766965, gnorm=0.366, clip=0, loss_scale=2, train_wall=91, gb_free=19.9, wall=6693 epoch 005: 56 / 1689 loss=4.579, nll_loss=2.996, ppl=7.98, wps=461643, ups=1.08, wpb=429081, bsz=16287.7, num_updates=6800, lr=0.000766965, gnorm=0.366, clip=0, loss_scale=2, train_wall=91, gb_free=19.9, wall=6693 epoch 005: 156 / 1689 loss=4.574, nll_loss=2.991, ppl=7.95, wps=466160, ups=1.07, wpb=434468, bsz=16369.4, num_updates=6900, lr=0.000761387, gnorm=0.387, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=6787 epoch 005: 156 / 1689 loss=4.574, nll_loss=2.991, ppl=7.95, wps=466160, ups=1.07, wpb=434468, bsz=16369.4, num_updates=6900, lr=0.000761387, gnorm=0.387, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=6787 epoch 005: 156 / 1689 loss=4.574, nll_loss=2.991, ppl=7.95, wps=466160, ups=1.07, wpb=434468, bsz=16369.4, num_updates=6900, lr=0.000761387, gnorm=0.387, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=6787 epoch 005: 156 / 1689 loss=4.574, nll_loss=2.991, ppl=7.95, wps=466160, ups=1.07, wpb=434468, bsz=16369.4, num_updates=6900, lr=0.000761387, gnorm=0.387, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=6787 epoch 005: 156 / 1689 loss=4.574, nll_loss=2.991, ppl=7.95, wps=466160, ups=1.07, wpb=434468, bsz=16369.4, num_updates=6900, lr=0.000761387, gnorm=0.387, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=6787 epoch 005: 256 / 1689 loss=4.569, nll_loss=2.986, ppl=7.92, wps=462443, ups=1.07, wpb=432530, bsz=16314.4, num_updates=7000, lr=0.000755929, gnorm=0.382, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=6880 epoch 005: 256 / 1689 loss=4.569, nll_loss=2.986, ppl=7.92, wps=462443, ups=1.07, wpb=432530, bsz=16314.4, num_updates=7000, lr=0.000755929, gnorm=0.382, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=6880 epoch 005: 256 / 1689 loss=4.569, nll_loss=2.986, ppl=7.92, wps=462443, ups=1.07, wpb=432530, bsz=16314.4, num_updates=7000, lr=0.000755929, gnorm=0.382, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=6880 epoch 005: 256 / 1689 loss=4.569, nll_loss=2.986, ppl=7.92, wps=462443, ups=1.07, wpb=432530, bsz=16314.4, num_updates=7000, lr=0.000755929, gnorm=0.382, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=6880 epoch 005: 256 / 1689 loss=4.569, nll_loss=2.986, ppl=7.92, wps=462443, ups=1.07, wpb=432530, bsz=16314.4, num_updates=7000, lr=0.000755929, gnorm=0.382, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=6880 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 4.504 | nll_loss 2.855 | ppl 7.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.504 epoch 005 | valid on 'valid' subset | loss 4.504 | nll_loss 2.855 | ppl 7.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.504 epoch 005 | valid on 'valid' subset | loss 4.504 | nll_loss 2.855 | ppl 7.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.504 epoch 005 | valid on 'valid' subset | loss 4.504 | nll_loss 2.855 | ppl 7.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.504 epoch 005 | valid on 'valid' subset | loss 4.504 | nll_loss 2.855 | ppl 7.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.504 epoch 005: 357 / 1689 loss=4.559, nll_loss=2.975, ppl=7.87, wps=373788, ups=0.86, wpb=434585, bsz=16726.1, num_updates=7100, lr=0.000750587, gnorm=0.377, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=6996 epoch 005: 357 / 1689 loss=4.559, nll_loss=2.975, ppl=7.87, wps=373788, ups=0.86, wpb=434585, bsz=16726.1, num_updates=7100, lr=0.000750587, gnorm=0.377, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=6996 epoch 005: 357 / 1689 loss=4.559, nll_loss=2.975, ppl=7.87, wps=373788, ups=0.86, wpb=434585, bsz=16726.1, num_updates=7100, lr=0.000750587, gnorm=0.377, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=6996 epoch 005: 357 / 1689 loss=4.559, nll_loss=2.975, ppl=7.87, wps=373788, ups=0.86, wpb=434585, bsz=16726.1, num_updates=7100, lr=0.000750587, gnorm=0.377, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=6996 epoch 005: 357 / 1689 loss=4.559, nll_loss=2.975, ppl=7.87, wps=373788, ups=0.86, wpb=434585, bsz=16726.1, num_updates=7100, lr=0.000750587, gnorm=0.377, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=6996 epoch 005: 457 / 1689 loss=4.569, nll_loss=2.987, ppl=7.93, wps=459722, ups=1.06, wpb=432814, bsz=16631.4, num_updates=7200, lr=0.000745356, gnorm=0.362, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=7090 epoch 005: 457 / 1689 loss=4.569, nll_loss=2.987, ppl=7.93, wps=459722, ups=1.06, wpb=432814, bsz=16631.4, num_updates=7200, lr=0.000745356, gnorm=0.362, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=7090 epoch 005: 457 / 1689 loss=4.569, nll_loss=2.987, ppl=7.93, wps=459722, ups=1.06, wpb=432814, bsz=16631.4, num_updates=7200, lr=0.000745356, gnorm=0.362, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=7090 epoch 005: 457 / 1689 loss=4.569, nll_loss=2.987, ppl=7.93, wps=459722, ups=1.06, wpb=432814, bsz=16631.4, num_updates=7200, lr=0.000745356, gnorm=0.362, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=7090 epoch 005: 457 / 1689 loss=4.569, nll_loss=2.987, ppl=7.93, wps=459722, ups=1.06, wpb=432814, bsz=16631.4, num_updates=7200, lr=0.000745356, gnorm=0.362, clip=0, loss_scale=2, train_wall=93, gb_free=20.5, wall=7090 epoch 005: 557 / 1689 loss=4.555, nll_loss=2.972, ppl=7.84, wps=464194, ups=1.07, wpb=434183, bsz=16561, num_updates=7300, lr=0.000740233, gnorm=0.365, clip=0, loss_scale=2, train_wall=91, gb_free=20, wall=7184 epoch 005: 557 / 1689 loss=4.555, nll_loss=2.972, ppl=7.84, wps=464194, ups=1.07, wpb=434183, bsz=16561, num_updates=7300, lr=0.000740233, gnorm=0.365, clip=0, loss_scale=2, train_wall=91, gb_free=20, wall=7184 epoch 005: 557 / 1689 loss=4.555, nll_loss=2.972, ppl=7.84, wps=464194, ups=1.07, wpb=434183, bsz=16561, num_updates=7300, lr=0.000740233, gnorm=0.365, clip=0, loss_scale=2, train_wall=91, gb_free=20, wall=7184 epoch 005: 557 / 1689 loss=4.555, nll_loss=2.972, ppl=7.84, wps=464194, ups=1.07, wpb=434183, bsz=16561, num_updates=7300, lr=0.000740233, gnorm=0.365, clip=0, loss_scale=2, train_wall=91, gb_free=20, wall=7184 epoch 005: 557 / 1689 loss=4.555, nll_loss=2.972, ppl=7.84, wps=464194, ups=1.07, wpb=434183, bsz=16561, num_updates=7300, lr=0.000740233, gnorm=0.365, clip=0, loss_scale=2, train_wall=91, gb_free=20, wall=7184 epoch 005: 657 / 1689 loss=4.562, nll_loss=2.98, ppl=7.89, wps=461081, ups=1.06, wpb=433395, bsz=16480.2, num_updates=7400, lr=0.000735215, gnorm=0.372, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=7278 epoch 005: 657 / 1689 loss=4.562, nll_loss=2.98, ppl=7.89, wps=461081, ups=1.06, wpb=433395, bsz=16480.2, num_updates=7400, lr=0.000735215, gnorm=0.372, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=7278 epoch 005: 657 / 1689 loss=4.562, nll_loss=2.98, ppl=7.89, wps=461081, ups=1.06, wpb=433395, bsz=16480.2, num_updates=7400, lr=0.000735215, gnorm=0.372, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=7278 epoch 005: 657 / 1689 loss=4.562, nll_loss=2.98, ppl=7.89, wps=461081, ups=1.06, wpb=433395, bsz=16480.2, num_updates=7400, lr=0.000735215, gnorm=0.372, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=7278 epoch 005: 657 / 1689 loss=4.562, nll_loss=2.98, ppl=7.89, wps=461081, ups=1.06, wpb=433395, bsz=16480.2, num_updates=7400, lr=0.000735215, gnorm=0.372, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=7278 epoch 005: 757 / 1689 loss=4.553, nll_loss=2.969, ppl=7.83, wps=462518, ups=1.07, wpb=431303, bsz=16399.1, num_updates=7500, lr=0.000730297, gnorm=0.359, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=7371 epoch 005: 757 / 1689 loss=4.553, nll_loss=2.969, ppl=7.83, wps=462518, ups=1.07, wpb=431303, bsz=16399.1, num_updates=7500, lr=0.000730297, gnorm=0.359, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=7371 epoch 005: 757 / 1689 loss=4.553, nll_loss=2.969, ppl=7.83, wps=462518, ups=1.07, wpb=431303, bsz=16399.1, num_updates=7500, lr=0.000730297, gnorm=0.359, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=7371 epoch 005: 757 / 1689 loss=4.553, nll_loss=2.969, ppl=7.83, wps=462518, ups=1.07, wpb=431303, bsz=16399.1, num_updates=7500, lr=0.000730297, gnorm=0.359, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=7371 epoch 005: 757 / 1689 loss=4.553, nll_loss=2.969, ppl=7.83, wps=462518, ups=1.07, wpb=431303, bsz=16399.1, num_updates=7500, lr=0.000730297, gnorm=0.359, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=7371 epoch 005: 858 / 1689 loss=4.547, nll_loss=2.963, ppl=7.8, wps=460566, ups=1.06, wpb=434993, bsz=16481.4, num_updates=7600, lr=0.000725476, gnorm=0.376, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=7466 epoch 005: 858 / 1689 loss=4.547, nll_loss=2.963, ppl=7.8, wps=460566, ups=1.06, wpb=434993, bsz=16481.4, num_updates=7600, lr=0.000725476, gnorm=0.376, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=7466 epoch 005: 858 / 1689 loss=4.547, nll_loss=2.963, ppl=7.8, wps=460566, ups=1.06, wpb=434993, bsz=16481.4, num_updates=7600, lr=0.000725476, gnorm=0.376, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=7466 epoch 005: 858 / 1689 loss=4.547, nll_loss=2.963, ppl=7.8, wps=460566, ups=1.06, wpb=434993, bsz=16481.4, num_updates=7600, lr=0.000725476, gnorm=0.376, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=7466 epoch 005: 858 / 1689 loss=4.547, nll_loss=2.963, ppl=7.8, wps=460566, ups=1.06, wpb=434993, bsz=16481.4, num_updates=7600, lr=0.000725476, gnorm=0.376, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=7466 epoch 005: 958 / 1689 loss=4.551, nll_loss=2.969, ppl=7.83, wps=463083, ups=1.06, wpb=435473, bsz=16490.8, num_updates=7700, lr=0.00072075, gnorm=0.371, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=7560 epoch 005: 958 / 1689 loss=4.551, nll_loss=2.969, ppl=7.83, wps=463083, ups=1.06, wpb=435473, bsz=16490.8, num_updates=7700, lr=0.00072075, gnorm=0.371, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=7560 epoch 005: 958 / 1689 loss=4.551, nll_loss=2.969, ppl=7.83, wps=463083, ups=1.06, wpb=435473, bsz=16490.8, num_updates=7700, lr=0.00072075, gnorm=0.371, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=7560 epoch 005: 958 / 1689 loss=4.551, nll_loss=2.969, ppl=7.83, wps=463083, ups=1.06, wpb=435473, bsz=16490.8, num_updates=7700, lr=0.00072075, gnorm=0.371, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=7560 epoch 005: 958 / 1689 loss=4.551, nll_loss=2.969, ppl=7.83, wps=463083, ups=1.06, wpb=435473, bsz=16490.8, num_updates=7700, lr=0.00072075, gnorm=0.371, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=7560 epoch 005: 1058 / 1689 loss=4.533, nll_loss=2.948, ppl=7.72, wps=465587, ups=1.07, wpb=434155, bsz=16673.1, num_updates=7800, lr=0.000716115, gnorm=0.369, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=7653 epoch 005: 1058 / 1689 loss=4.533, nll_loss=2.948, ppl=7.72, wps=465587, ups=1.07, wpb=434155, bsz=16673.1, num_updates=7800, lr=0.000716115, gnorm=0.369, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=7653 epoch 005: 1058 / 1689 loss=4.533, nll_loss=2.948, ppl=7.72, wps=465587, ups=1.07, wpb=434155, bsz=16673.1, num_updates=7800, lr=0.000716115, gnorm=0.369, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=7653 epoch 005: 1058 / 1689 loss=4.533, nll_loss=2.948, ppl=7.72, wps=465587, ups=1.07, wpb=434155, bsz=16673.1, num_updates=7800, lr=0.000716115, gnorm=0.369, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=7653 epoch 005: 1058 / 1689 loss=4.533, nll_loss=2.948, ppl=7.72, wps=465587, ups=1.07, wpb=434155, bsz=16673.1, num_updates=7800, lr=0.000716115, gnorm=0.369, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=7653 epoch 005: 1158 / 1689 loss=4.534, nll_loss=2.949, ppl=7.72, wps=459518, ups=1.06, wpb=435287, bsz=16668.7, num_updates=7900, lr=0.000711568, gnorm=0.359, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=7748 epoch 005: 1158 / 1689 loss=4.534, nll_loss=2.949, ppl=7.72, wps=459518, ups=1.06, wpb=435287, bsz=16668.7, num_updates=7900, lr=0.000711568, gnorm=0.359, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=7748 epoch 005: 1158 / 1689 loss=4.534, nll_loss=2.949, ppl=7.72, wps=459518, ups=1.06, wpb=435287, bsz=16668.7, num_updates=7900, lr=0.000711568, gnorm=0.359, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=7748 epoch 005: 1158 / 1689 loss=4.534, nll_loss=2.949, ppl=7.72, wps=459518, ups=1.06, wpb=435287, bsz=16668.7, num_updates=7900, lr=0.000711568, gnorm=0.359, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=7748 epoch 005: 1158 / 1689 loss=4.534, nll_loss=2.949, ppl=7.72, wps=459518, ups=1.06, wpb=435287, bsz=16668.7, num_updates=7900, lr=0.000711568, gnorm=0.359, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=7748 epoch 005: 1258 / 1689 loss=4.538, nll_loss=2.954, ppl=7.75, wps=465318, ups=1.07, wpb=434466, bsz=16407.4, num_updates=8000, lr=0.000707107, gnorm=0.371, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=7841 epoch 005: 1258 / 1689 loss=4.538, nll_loss=2.954, ppl=7.75, wps=465318, ups=1.07, wpb=434466, bsz=16407.4, num_updates=8000, lr=0.000707107, gnorm=0.371, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=7841 epoch 005: 1258 / 1689 loss=4.538, nll_loss=2.954, ppl=7.75, wps=465318, ups=1.07, wpb=434466, bsz=16407.4, num_updates=8000, lr=0.000707107, gnorm=0.371, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=7841 epoch 005: 1258 / 1689 loss=4.538, nll_loss=2.954, ppl=7.75, wps=465318, ups=1.07, wpb=434466, bsz=16407.4, num_updates=8000, lr=0.000707107, gnorm=0.371, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=7841 epoch 005: 1258 / 1689 loss=4.538, nll_loss=2.954, ppl=7.75, wps=465318, ups=1.07, wpb=434466, bsz=16407.4, num_updates=8000, lr=0.000707107, gnorm=0.371, clip=0, loss_scale=1, train_wall=92, gb_free=18, wall=7841 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 4.453 | nll_loss 2.812 | ppl 7.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.453 epoch 005 | valid on 'valid' subset | loss 4.453 | nll_loss 2.812 | ppl 7.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.453 epoch 005 | valid on 'valid' subset | loss 4.453 | nll_loss 2.812 | ppl 7.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.453 epoch 005 | valid on 'valid' subset | loss 4.453 | nll_loss 2.812 | ppl 7.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.453 epoch 005 | valid on 'valid' subset | loss 4.453 | nll_loss 2.812 | ppl 7.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.453 epoch 005: 1358 / 1689 loss=4.532, nll_loss=2.948, ppl=7.72, wps=331468, ups=0.76, wpb=434720, bsz=16684.8, num_updates=8100, lr=0.000702728, gnorm=0.356, clip=0, loss_scale=2, train_wall=109, gb_free=19.5, wall=7972 epoch 005: 1358 / 1689 loss=4.532, nll_loss=2.948, ppl=7.72, wps=331468, ups=0.76, wpb=434720, bsz=16684.8, num_updates=8100, lr=0.000702728, gnorm=0.356, clip=0, loss_scale=2, train_wall=109, gb_free=19.5, wall=7972 epoch 005: 1358 / 1689 loss=4.532, nll_loss=2.948, ppl=7.72, wps=331468, ups=0.76, wpb=434720, bsz=16684.8, num_updates=8100, lr=0.000702728, gnorm=0.356, clip=0, loss_scale=2, train_wall=109, gb_free=19.5, wall=7972 epoch 005: 1358 / 1689 loss=4.532, nll_loss=2.948, ppl=7.72, wps=331468, ups=0.76, wpb=434720, bsz=16684.8, num_updates=8100, lr=0.000702728, gnorm=0.356, clip=0, loss_scale=2, train_wall=109, gb_free=19.5, wall=7972 epoch 005: 1358 / 1689 loss=4.532, nll_loss=2.948, ppl=7.72, wps=331468, ups=0.76, wpb=434720, bsz=16684.8, num_updates=8100, lr=0.000702728, gnorm=0.356, clip=0, loss_scale=2, train_wall=109, gb_free=19.5, wall=7972 epoch 005: 1458 / 1689 loss=4.527, nll_loss=2.942, ppl=7.69, wps=463137, ups=1.07, wpb=433094, bsz=16449.5, num_updates=8200, lr=0.00069843, gnorm=0.37, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=8066 epoch 005: 1458 / 1689 loss=4.527, nll_loss=2.942, ppl=7.69, wps=463137, ups=1.07, wpb=433094, bsz=16449.5, num_updates=8200, lr=0.00069843, gnorm=0.37, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=8066 epoch 005: 1458 / 1689 loss=4.527, nll_loss=2.942, ppl=7.69, wps=463137, ups=1.07, wpb=433094, bsz=16449.5, num_updates=8200, lr=0.00069843, gnorm=0.37, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=8066 epoch 005: 1458 / 1689 loss=4.527, nll_loss=2.942, ppl=7.69, wps=463137, ups=1.07, wpb=433094, bsz=16449.5, num_updates=8200, lr=0.00069843, gnorm=0.37, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=8066 epoch 005: 1458 / 1689 loss=4.527, nll_loss=2.942, ppl=7.69, wps=463137, ups=1.07, wpb=433094, bsz=16449.5, num_updates=8200, lr=0.00069843, gnorm=0.37, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=8066 epoch 005: 1558 / 1689 loss=4.519, nll_loss=2.933, ppl=7.64, wps=467215, ups=1.08, wpb=432099, bsz=16198.6, num_updates=8300, lr=0.00069421, gnorm=0.369, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=8158 epoch 005: 1558 / 1689 loss=4.519, nll_loss=2.933, ppl=7.64, wps=467215, ups=1.08, wpb=432099, bsz=16198.6, num_updates=8300, lr=0.00069421, gnorm=0.369, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=8158 epoch 005: 1558 / 1689 loss=4.519, nll_loss=2.933, ppl=7.64, wps=467215, ups=1.08, wpb=432099, bsz=16198.6, num_updates=8300, lr=0.00069421, gnorm=0.369, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=8158 epoch 005: 1558 / 1689 loss=4.519, nll_loss=2.933, ppl=7.64, wps=467215, ups=1.08, wpb=432099, bsz=16198.6, num_updates=8300, lr=0.00069421, gnorm=0.369, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=8158 epoch 005: 1558 / 1689 loss=4.519, nll_loss=2.933, ppl=7.64, wps=467215, ups=1.08, wpb=432099, bsz=16198.6, num_updates=8300, lr=0.00069421, gnorm=0.369, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=8158 epoch 005: 1658 / 1689 loss=4.526, nll_loss=2.942, ppl=7.69, wps=462882, ups=1.07, wpb=432882, bsz=16899.8, num_updates=8400, lr=0.000690066, gnorm=0.359, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=8252 epoch 005: 1658 / 1689 loss=4.526, nll_loss=2.942, ppl=7.69, wps=462882, ups=1.07, wpb=432882, bsz=16899.8, num_updates=8400, lr=0.000690066, gnorm=0.359, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=8252 epoch 005: 1658 / 1689 loss=4.526, nll_loss=2.942, ppl=7.69, wps=462882, ups=1.07, wpb=432882, bsz=16899.8, num_updates=8400, lr=0.000690066, gnorm=0.359, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=8252 epoch 005: 1658 / 1689 loss=4.526, nll_loss=2.942, ppl=7.69, wps=462882, ups=1.07, wpb=432882, bsz=16899.8, num_updates=8400, lr=0.000690066, gnorm=0.359, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=8252 epoch 005: 1658 / 1689 loss=4.526, nll_loss=2.942, ppl=7.69, wps=462882, ups=1.07, wpb=432882, bsz=16899.8, num_updates=8400, lr=0.000690066, gnorm=0.359, clip=0, loss_scale=2, train_wall=93, gb_free=17.8, wall=8252 end of epoch 5 (average epoch stats below) epoch 005 | loss 4.547 | nll_loss 2.963 | ppl 7.8 | wps 446239 | ups 1.03 | wpb 433538 | bsz 16506.6 | num_updates 8431 | lr 0.000688796 | gnorm 0.369 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 23.9 | wall 8280 epoch 005 | loss 4.547 | nll_loss 2.963 | ppl 7.8 | wps 446239 | ups 1.03 | wpb 433538 | bsz 16506.6 | num_updates 8431 | lr 0.000688796 | gnorm 0.369 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 23.9 | wall 8280 epoch 005 | loss 4.547 | nll_loss 2.963 | ppl 7.8 | wps 446239 | ups 1.03 | wpb 433538 | bsz 16506.6 | num_updates 8431 | lr 0.000688796 | gnorm 0.369 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 23.9 | wall 8280 epoch 005 | loss 4.547 | nll_loss 2.963 | ppl 7.8 | wps 446239 | ups 1.03 | wpb 433538 | bsz 16506.6 | num_updates 8431 | lr 0.000688796 | gnorm 0.369 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 23.9 | wall 8280 epoch 005 | loss 4.547 | nll_loss 2.963 | ppl 7.8 | wps 446239 | ups 1.03 | wpb 433538 | bsz 16506.6 | num_updates 8431 | lr 0.000688796 | gnorm 0.369 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 23.9 | wall 8280 Start iterating over samples epoch 006: 69 / 1689 loss=4.489, nll_loss=2.899, ppl=7.46, wps=462756, ups=1.08, wpb=430062, bsz=16039.8, num_updates=8500, lr=0.000685994, gnorm=0.363, clip=0, loss_scale=2, train_wall=91, gb_free=20.2, wall=8345 epoch 006: 69 / 1689 loss=4.489, nll_loss=2.899, ppl=7.46, wps=462756, ups=1.08, wpb=430062, bsz=16039.8, num_updates=8500, lr=0.000685994, gnorm=0.363, clip=0, loss_scale=2, train_wall=91, gb_free=20.2, wall=8345 epoch 006: 69 / 1689 loss=4.489, nll_loss=2.899, ppl=7.46, wps=462756, ups=1.08, wpb=430062, bsz=16039.8, num_updates=8500, lr=0.000685994, gnorm=0.363, clip=0, loss_scale=2, train_wall=91, gb_free=20.2, wall=8345 epoch 006: 69 / 1689 loss=4.489, nll_loss=2.899, ppl=7.46, wps=462756, ups=1.08, wpb=430062, bsz=16039.8, num_updates=8500, lr=0.000685994, gnorm=0.363, clip=0, loss_scale=2, train_wall=91, gb_free=20.2, wall=8345 epoch 006: 69 / 1689 loss=4.489, nll_loss=2.899, ppl=7.46, wps=462756, ups=1.08, wpb=430062, bsz=16039.8, num_updates=8500, lr=0.000685994, gnorm=0.363, clip=0, loss_scale=2, train_wall=91, gb_free=20.2, wall=8345 epoch 006: 69 / 1689 loss=4.489, nll_loss=2.899, ppl=7.46, wps=462756, ups=1.08, wpb=430062, bsz=16039.8, num_updates=8500, lr=0.000685994, gnorm=0.363, clip=0, loss_scale=2, train_wall=91, gb_free=20.2, wall=8345 epoch 006: 170 / 1689 loss=4.497, nll_loss=2.908, ppl=7.51, wps=463521, ups=1.07, wpb=435003, bsz=16356.6, num_updates=8600, lr=0.000681994, gnorm=0.35, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=8439 epoch 006: 170 / 1689 loss=4.497, nll_loss=2.908, ppl=7.51, wps=463521, ups=1.07, wpb=435003, bsz=16356.6, num_updates=8600, lr=0.000681994, gnorm=0.35, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=8439 epoch 006: 170 / 1689 loss=4.497, nll_loss=2.908, ppl=7.51, wps=463521, ups=1.07, wpb=435003, bsz=16356.6, num_updates=8600, lr=0.000681994, gnorm=0.35, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=8439 epoch 006: 170 / 1689 loss=4.497, nll_loss=2.908, ppl=7.51, wps=463521, ups=1.07, wpb=435003, bsz=16356.6, num_updates=8600, lr=0.000681994, gnorm=0.35, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=8439 epoch 006: 170 / 1689 loss=4.497, nll_loss=2.908, ppl=7.51, wps=463521, ups=1.07, wpb=435003, bsz=16356.6, num_updates=8600, lr=0.000681994, gnorm=0.35, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=8439 epoch 006: 170 / 1689 loss=4.497, nll_loss=2.908, ppl=7.51, wps=463521, ups=1.07, wpb=435003, bsz=16356.6, num_updates=8600, lr=0.000681994, gnorm=0.35, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=8439 epoch 006: 270 / 1689 loss=4.482, nll_loss=2.892, ppl=7.42, wps=464204, ups=1.07, wpb=433064, bsz=16611.5, num_updates=8700, lr=0.000678064, gnorm=0.351, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=8532 epoch 006: 270 / 1689 loss=4.482, nll_loss=2.892, ppl=7.42, wps=464204, ups=1.07, wpb=433064, bsz=16611.5, num_updates=8700, lr=0.000678064, gnorm=0.351, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=8532 epoch 006: 270 / 1689 loss=4.482, nll_loss=2.892, ppl=7.42, wps=464204, ups=1.07, wpb=433064, bsz=16611.5, num_updates=8700, lr=0.000678064, gnorm=0.351, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=8532 epoch 006: 270 / 1689 loss=4.482, nll_loss=2.892, ppl=7.42, wps=464204, ups=1.07, wpb=433064, bsz=16611.5, num_updates=8700, lr=0.000678064, gnorm=0.351, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=8532 epoch 006: 270 / 1689 loss=4.482, nll_loss=2.892, ppl=7.42, wps=464204, ups=1.07, wpb=433064, bsz=16611.5, num_updates=8700, lr=0.000678064, gnorm=0.351, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=8532 epoch 006: 270 / 1689 loss=4.482, nll_loss=2.892, ppl=7.42, wps=464204, ups=1.07, wpb=433064, bsz=16611.5, num_updates=8700, lr=0.000678064, gnorm=0.351, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=8532 epoch 006: 370 / 1689 loss=4.493, nll_loss=2.904, ppl=7.48, wps=461117, ups=1.06, wpb=433504, bsz=16333.4, num_updates=8800, lr=0.0006742, gnorm=0.351, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=8626 epoch 006: 370 / 1689 loss=4.493, nll_loss=2.904, ppl=7.48, wps=461117, ups=1.06, wpb=433504, bsz=16333.4, num_updates=8800, lr=0.0006742, gnorm=0.351, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=8626 epoch 006: 370 / 1689 loss=4.493, nll_loss=2.904, ppl=7.48, wps=461117, ups=1.06, wpb=433504, bsz=16333.4, num_updates=8800, lr=0.0006742, gnorm=0.351, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=8626 epoch 006: 370 / 1689 loss=4.493, nll_loss=2.904, ppl=7.48, wps=461117, ups=1.06, wpb=433504, bsz=16333.4, num_updates=8800, lr=0.0006742, gnorm=0.351, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=8626 epoch 006: 370 / 1689 loss=4.493, nll_loss=2.904, ppl=7.48, wps=461117, ups=1.06, wpb=433504, bsz=16333.4, num_updates=8800, lr=0.0006742, gnorm=0.351, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=8626 epoch 006: 370 / 1689 loss=4.493, nll_loss=2.904, ppl=7.48, wps=461117, ups=1.06, wpb=433504, bsz=16333.4, num_updates=8800, lr=0.0006742, gnorm=0.351, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=8626 epoch 006: 470 / 1689 loss=4.486, nll_loss=2.896, ppl=7.45, wps=459968, ups=1.06, wpb=432115, bsz=16431.3, num_updates=8900, lr=0.000670402, gnorm=0.36, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=8720 epoch 006: 470 / 1689 loss=4.486, nll_loss=2.896, ppl=7.45, wps=459968, ups=1.06, wpb=432115, bsz=16431.3, num_updates=8900, lr=0.000670402, gnorm=0.36, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=8720 epoch 006: 470 / 1689 loss=4.486, nll_loss=2.896, ppl=7.45, wps=459968, ups=1.06, wpb=432115, bsz=16431.3, num_updates=8900, lr=0.000670402, gnorm=0.36, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=8720 epoch 006: 470 / 1689 loss=4.486, nll_loss=2.896, ppl=7.45, wps=459968, ups=1.06, wpb=432115, bsz=16431.3, num_updates=8900, lr=0.000670402, gnorm=0.36, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=8720 epoch 006: 470 / 1689 loss=4.486, nll_loss=2.896, ppl=7.45, wps=459968, ups=1.06, wpb=432115, bsz=16431.3, num_updates=8900, lr=0.000670402, gnorm=0.36, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=8720 epoch 006: 470 / 1689 loss=4.486, nll_loss=2.896, ppl=7.45, wps=459968, ups=1.06, wpb=432115, bsz=16431.3, num_updates=8900, lr=0.000670402, gnorm=0.36, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=8720 epoch 006: 570 / 1689 loss=4.483, nll_loss=2.894, ppl=7.43, wps=462874, ups=1.06, wpb=435174, bsz=16641.6, num_updates=9000, lr=0.000666667, gnorm=0.36, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=8814 epoch 006: 570 / 1689 loss=4.483, nll_loss=2.894, ppl=7.43, wps=462874, ups=1.06, wpb=435174, bsz=16641.6, num_updates=9000, lr=0.000666667, gnorm=0.36, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=8814 epoch 006: 570 / 1689 loss=4.483, nll_loss=2.894, ppl=7.43, wps=462874, ups=1.06, wpb=435174, bsz=16641.6, num_updates=9000, lr=0.000666667, gnorm=0.36, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=8814 epoch 006: 570 / 1689 loss=4.483, nll_loss=2.894, ppl=7.43, wps=462874, ups=1.06, wpb=435174, bsz=16641.6, num_updates=9000, lr=0.000666667, gnorm=0.36, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=8814 epoch 006: 570 / 1689 loss=4.483, nll_loss=2.894, ppl=7.43, wps=462874, ups=1.06, wpb=435174, bsz=16641.6, num_updates=9000, lr=0.000666667, gnorm=0.36, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=8814 epoch 006: 570 / 1689 loss=4.483, nll_loss=2.894, ppl=7.43, wps=462874, ups=1.06, wpb=435174, bsz=16641.6, num_updates=9000, lr=0.000666667, gnorm=0.36, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=8814 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 4.428 | nll_loss 2.779 | ppl 6.87 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.428 epoch 006 | valid on 'valid' subset | loss 4.428 | nll_loss 2.779 | ppl 6.87 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.428 epoch 006 | valid on 'valid' subset | loss 4.428 | nll_loss 2.779 | ppl 6.87 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.428 epoch 006 | valid on 'valid' subset | loss 4.428 | nll_loss 2.779 | ppl 6.87 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.428 epoch 006 | valid on 'valid' subset | loss 4.428 | nll_loss 2.779 | ppl 6.87 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.428 epoch 006 | valid on 'valid' subset | loss 4.428 | nll_loss 2.779 | ppl 6.87 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.428 epoch 006: 671 / 1689 loss=4.482, nll_loss=2.893, ppl=7.43, wps=374470, ups=0.87, wpb=432273, bsz=16874, num_updates=9100, lr=0.000662994, gnorm=0.345, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=8929 epoch 006: 671 / 1689 loss=4.482, nll_loss=2.893, ppl=7.43, wps=374470, ups=0.87, wpb=432273, bsz=16874, num_updates=9100, lr=0.000662994, gnorm=0.345, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=8929 epoch 006: 671 / 1689 loss=4.482, nll_loss=2.893, ppl=7.43, wps=374470, ups=0.87, wpb=432273, bsz=16874, num_updates=9100, lr=0.000662994, gnorm=0.345, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=8929 epoch 006: 671 / 1689 loss=4.482, nll_loss=2.893, ppl=7.43, wps=374470, ups=0.87, wpb=432273, bsz=16874, num_updates=9100, lr=0.000662994, gnorm=0.345, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=8929 epoch 006: 671 / 1689 loss=4.482, nll_loss=2.893, ppl=7.43, wps=374470, ups=0.87, wpb=432273, bsz=16874, num_updates=9100, lr=0.000662994, gnorm=0.345, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=8929 epoch 006: 671 / 1689 loss=4.482, nll_loss=2.893, ppl=7.43, wps=374470, ups=0.87, wpb=432273, bsz=16874, num_updates=9100, lr=0.000662994, gnorm=0.345, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=8929 epoch 006: 771 / 1689 loss=4.479, nll_loss=2.889, ppl=7.41, wps=461638, ups=1.07, wpb=433356, bsz=16388.8, num_updates=9200, lr=0.00065938, gnorm=0.364, clip=0, loss_scale=1, train_wall=92, gb_free=20.7, wall=9023 epoch 006: 771 / 1689 loss=4.479, nll_loss=2.889, ppl=7.41, wps=461638, ups=1.07, wpb=433356, bsz=16388.8, num_updates=9200, lr=0.00065938, gnorm=0.364, clip=0, loss_scale=1, train_wall=92, gb_free=20.7, wall=9023 epoch 006: 771 / 1689 loss=4.479, nll_loss=2.889, ppl=7.41, wps=461638, ups=1.07, wpb=433356, bsz=16388.8, num_updates=9200, lr=0.00065938, gnorm=0.364, clip=0, loss_scale=1, train_wall=92, gb_free=20.7, wall=9023 epoch 006: 771 / 1689 loss=4.479, nll_loss=2.889, ppl=7.41, wps=461638, ups=1.07, wpb=433356, bsz=16388.8, num_updates=9200, lr=0.00065938, gnorm=0.364, clip=0, loss_scale=1, train_wall=92, gb_free=20.7, wall=9023 epoch 006: 771 / 1689 loss=4.479, nll_loss=2.889, ppl=7.41, wps=461638, ups=1.07, wpb=433356, bsz=16388.8, num_updates=9200, lr=0.00065938, gnorm=0.364, clip=0, loss_scale=1, train_wall=92, gb_free=20.7, wall=9023 epoch 006: 771 / 1689 loss=4.479, nll_loss=2.889, ppl=7.41, wps=461638, ups=1.07, wpb=433356, bsz=16388.8, num_updates=9200, lr=0.00065938, gnorm=0.364, clip=0, loss_scale=1, train_wall=92, gb_free=20.7, wall=9023 epoch 006: 871 / 1689 loss=4.493, nll_loss=2.905, ppl=7.49, wps=460997, ups=1.06, wpb=433893, bsz=16833.3, num_updates=9300, lr=0.000655826, gnorm=0.349, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=9117 epoch 006: 871 / 1689 loss=4.493, nll_loss=2.905, ppl=7.49, wps=460997, ups=1.06, wpb=433893, bsz=16833.3, num_updates=9300, lr=0.000655826, gnorm=0.349, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=9117 epoch 006: 871 / 1689 loss=4.493, nll_loss=2.905, ppl=7.49, wps=460997, ups=1.06, wpb=433893, bsz=16833.3, num_updates=9300, lr=0.000655826, gnorm=0.349, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=9117 epoch 006: 871 / 1689 loss=4.493, nll_loss=2.905, ppl=7.49, wps=460997, ups=1.06, wpb=433893, bsz=16833.3, num_updates=9300, lr=0.000655826, gnorm=0.349, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=9117 epoch 006: 871 / 1689 loss=4.493, nll_loss=2.905, ppl=7.49, wps=460997, ups=1.06, wpb=433893, bsz=16833.3, num_updates=9300, lr=0.000655826, gnorm=0.349, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=9117 epoch 006: 871 / 1689 loss=4.493, nll_loss=2.905, ppl=7.49, wps=460997, ups=1.06, wpb=433893, bsz=16833.3, num_updates=9300, lr=0.000655826, gnorm=0.349, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=9117 epoch 006: 971 / 1689 loss=4.481, nll_loss=2.892, ppl=7.42, wps=462200, ups=1.06, wpb=434361, bsz=16305.7, num_updates=9400, lr=0.000652328, gnorm=0.345, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=9211 epoch 006: 971 / 1689 loss=4.481, nll_loss=2.892, ppl=7.42, wps=462200, ups=1.06, wpb=434361, bsz=16305.7, num_updates=9400, lr=0.000652328, gnorm=0.345, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=9211 epoch 006: 971 / 1689 loss=4.481, nll_loss=2.892, ppl=7.42, wps=462200, ups=1.06, wpb=434361, bsz=16305.7, num_updates=9400, lr=0.000652328, gnorm=0.345, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=9211 epoch 006: 971 / 1689 loss=4.481, nll_loss=2.892, ppl=7.42, wps=462200, ups=1.06, wpb=434361, bsz=16305.7, num_updates=9400, lr=0.000652328, gnorm=0.345, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=9211 epoch 006: 971 / 1689 loss=4.481, nll_loss=2.892, ppl=7.42, wps=462200, ups=1.06, wpb=434361, bsz=16305.7, num_updates=9400, lr=0.000652328, gnorm=0.345, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=9211 epoch 006: 971 / 1689 loss=4.481, nll_loss=2.892, ppl=7.42, wps=462200, ups=1.06, wpb=434361, bsz=16305.7, num_updates=9400, lr=0.000652328, gnorm=0.345, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=9211 epoch 006: 1071 / 1689 loss=4.475, nll_loss=2.885, ppl=7.39, wps=463822, ups=1.07, wpb=432636, bsz=16451.7, num_updates=9500, lr=0.000648886, gnorm=0.364, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=9305 epoch 006: 1071 / 1689 loss=4.475, nll_loss=2.885, ppl=7.39, wps=463822, ups=1.07, wpb=432636, bsz=16451.7, num_updates=9500, lr=0.000648886, gnorm=0.364, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=9305 epoch 006: 1071 / 1689 loss=4.475, nll_loss=2.885, ppl=7.39, wps=463822, ups=1.07, wpb=432636, bsz=16451.7, num_updates=9500, lr=0.000648886, gnorm=0.364, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=9305 epoch 006: 1071 / 1689 loss=4.475, nll_loss=2.885, ppl=7.39, wps=463822, ups=1.07, wpb=432636, bsz=16451.7, num_updates=9500, lr=0.000648886, gnorm=0.364, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=9305 epoch 006: 1071 / 1689 loss=4.475, nll_loss=2.885, ppl=7.39, wps=463822, ups=1.07, wpb=432636, bsz=16451.7, num_updates=9500, lr=0.000648886, gnorm=0.364, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=9305 epoch 006: 1071 / 1689 loss=4.475, nll_loss=2.885, ppl=7.39, wps=463822, ups=1.07, wpb=432636, bsz=16451.7, num_updates=9500, lr=0.000648886, gnorm=0.364, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=9305 epoch 006: 1172 / 1689 loss=4.474, nll_loss=2.885, ppl=7.39, wps=459198, ups=1.06, wpb=434900, bsz=16630.9, num_updates=9600, lr=0.000645497, gnorm=0.348, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=9399 epoch 006: 1172 / 1689 loss=4.474, nll_loss=2.885, ppl=7.39, wps=459198, ups=1.06, wpb=434900, bsz=16630.9, num_updates=9600, lr=0.000645497, gnorm=0.348, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=9399 epoch 006: 1172 / 1689 loss=4.474, nll_loss=2.885, ppl=7.39, wps=459198, ups=1.06, wpb=434900, bsz=16630.9, num_updates=9600, lr=0.000645497, gnorm=0.348, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=9399 epoch 006: 1172 / 1689 loss=4.474, nll_loss=2.885, ppl=7.39, wps=459198, ups=1.06, wpb=434900, bsz=16630.9, num_updates=9600, lr=0.000645497, gnorm=0.348, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=9399 epoch 006: 1172 / 1689 loss=4.474, nll_loss=2.885, ppl=7.39, wps=459198, ups=1.06, wpb=434900, bsz=16630.9, num_updates=9600, lr=0.000645497, gnorm=0.348, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=9399 epoch 006: 1172 / 1689 loss=4.474, nll_loss=2.885, ppl=7.39, wps=459198, ups=1.06, wpb=434900, bsz=16630.9, num_updates=9600, lr=0.000645497, gnorm=0.348, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=9399 epoch 006: 1272 / 1689 loss=4.468, nll_loss=2.877, ppl=7.35, wps=462602, ups=1.07, wpb=433171, bsz=16270.3, num_updates=9700, lr=0.000642161, gnorm=0.343, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=9493 epoch 006: 1272 / 1689 loss=4.468, nll_loss=2.877, ppl=7.35, wps=462602, ups=1.07, wpb=433171, bsz=16270.3, num_updates=9700, lr=0.000642161, gnorm=0.343, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=9493 epoch 006: 1272 / 1689 loss=4.468, nll_loss=2.877, ppl=7.35, wps=462602, ups=1.07, wpb=433171, bsz=16270.3, num_updates=9700, lr=0.000642161, gnorm=0.343, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=9493 epoch 006: 1272 / 1689 loss=4.468, nll_loss=2.877, ppl=7.35, wps=462602, ups=1.07, wpb=433171, bsz=16270.3, num_updates=9700, lr=0.000642161, gnorm=0.343, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=9493 epoch 006: 1272 / 1689 loss=4.468, nll_loss=2.877, ppl=7.35, wps=462602, ups=1.07, wpb=433171, bsz=16270.3, num_updates=9700, lr=0.000642161, gnorm=0.343, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=9493 epoch 006: 1272 / 1689 loss=4.468, nll_loss=2.877, ppl=7.35, wps=462602, ups=1.07, wpb=433171, bsz=16270.3, num_updates=9700, lr=0.000642161, gnorm=0.343, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=9493 epoch 006: 1372 / 1689 loss=4.475, nll_loss=2.886, ppl=7.39, wps=465572, ups=1.08, wpb=432846, bsz=16658.9, num_updates=9800, lr=0.000638877, gnorm=0.359, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=9586 epoch 006: 1372 / 1689 loss=4.475, nll_loss=2.886, ppl=7.39, wps=465572, ups=1.08, wpb=432846, bsz=16658.9, num_updates=9800, lr=0.000638877, gnorm=0.359, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=9586 epoch 006: 1372 / 1689 loss=4.475, nll_loss=2.886, ppl=7.39, wps=465572, ups=1.08, wpb=432846, bsz=16658.9, num_updates=9800, lr=0.000638877, gnorm=0.359, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=9586 epoch 006: 1372 / 1689 loss=4.475, nll_loss=2.886, ppl=7.39, wps=465572, ups=1.08, wpb=432846, bsz=16658.9, num_updates=9800, lr=0.000638877, gnorm=0.359, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=9586 epoch 006: 1372 / 1689 loss=4.475, nll_loss=2.886, ppl=7.39, wps=465572, ups=1.08, wpb=432846, bsz=16658.9, num_updates=9800, lr=0.000638877, gnorm=0.359, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=9586 epoch 006: 1372 / 1689 loss=4.475, nll_loss=2.886, ppl=7.39, wps=465572, ups=1.08, wpb=432846, bsz=16658.9, num_updates=9800, lr=0.000638877, gnorm=0.359, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=9586 epoch 006: 1472 / 1689 loss=4.477, nll_loss=2.889, ppl=7.41, wps=464146, ups=1.06, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.33, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=9680 epoch 006: 1472 / 1689 loss=4.477, nll_loss=2.889, ppl=7.41, wps=464146, ups=1.06, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.33, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=9680 epoch 006: 1472 / 1689 loss=4.477, nll_loss=2.889, ppl=7.41, wps=464146, ups=1.06, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.33, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=9680 epoch 006: 1472 / 1689 loss=4.477, nll_loss=2.889, ppl=7.41, wps=464146, ups=1.06, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.33, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=9680 epoch 006: 1472 / 1689 loss=4.477, nll_loss=2.889, ppl=7.41, wps=464146, ups=1.06, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.33, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=9680 epoch 006: 1472 / 1689 loss=4.477, nll_loss=2.889, ppl=7.41, wps=464146, ups=1.06, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.33, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=9680 epoch 006: 1572 / 1689 loss=4.471, nll_loss=2.882, ppl=7.37, wps=462614, ups=1.06, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.34, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=9774 epoch 006: 1572 / 1689 loss=4.471, nll_loss=2.882, ppl=7.37, wps=462614, ups=1.06, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.34, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=9774 epoch 006: 1572 / 1689 loss=4.471, nll_loss=2.882, ppl=7.37, wps=462614, ups=1.06, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.34, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=9774 epoch 006: 1572 / 1689 loss=4.471, nll_loss=2.882, ppl=7.37, wps=462614, ups=1.06, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.34, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=9774 epoch 006: 1572 / 1689 loss=4.471, nll_loss=2.882, ppl=7.37, wps=462614, ups=1.06, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.34, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=9774 epoch 006: 1572 / 1689 loss=4.471, nll_loss=2.882, ppl=7.37, wps=462614, ups=1.06, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.34, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=9774 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 4.402 | nll_loss 2.755 | ppl 6.75 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.402 epoch 006 | valid on 'valid' subset | loss 4.402 | nll_loss 2.755 | ppl 6.75 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.402 epoch 006 | valid on 'valid' subset | loss 4.402 | nll_loss 2.755 | ppl 6.75 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.402 epoch 006 | valid on 'valid' subset | loss 4.402 | nll_loss 2.755 | ppl 6.75 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.402 epoch 006 | valid on 'valid' subset | loss 4.402 | nll_loss 2.755 | ppl 6.75 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.402 epoch 006 | valid on 'valid' subset | loss 4.402 | nll_loss 2.755 | ppl 6.75 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.402 epoch 006: 1672 / 1689 loss=4.454, nll_loss=2.863, ppl=7.28, wps=380289, ups=0.88, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.354, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=9888 epoch 006: 1672 / 1689 loss=4.454, nll_loss=2.863, ppl=7.28, wps=380289, ups=0.88, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.354, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=9888 epoch 006: 1672 / 1689 loss=4.454, nll_loss=2.863, ppl=7.28, wps=380289, ups=0.88, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.354, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=9888 epoch 006: 1672 / 1689 loss=4.454, nll_loss=2.863, ppl=7.28, wps=380289, ups=0.88, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.354, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=9888 epoch 006: 1672 / 1689 loss=4.454, nll_loss=2.863, ppl=7.28, wps=380289, ups=0.88, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.354, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=9888 epoch 006: 1672 / 1689 loss=4.454, nll_loss=2.863, ppl=7.28, wps=380289, ups=0.88, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.354, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=9888 end of epoch 6 (average epoch stats below) epoch 006 | loss 4.479 | nll_loss 2.889 | ppl 7.41 | wps 450294 | ups 1.04 | wpb 433556 | bsz 16506.3 | num_updates 10117 | lr 0.000628788 | gnorm 0.352 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 22.5 | wall 9903 epoch 006 | loss 4.479 | nll_loss 2.889 | ppl 7.41 | wps 450294 | ups 1.04 | wpb 433556 | bsz 16506.3 | num_updates 10117 | lr 0.000628788 | gnorm 0.352 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 22.5 | wall 9903 epoch 006 | loss 4.479 | nll_loss 2.889 | ppl 7.41 | wps 450294 | ups 1.04 | wpb 433556 | bsz 16506.3 | num_updates 10117 | lr 0.000628788 | gnorm 0.352 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 22.5 | wall 9903 epoch 006 | loss 4.479 | nll_loss 2.889 | ppl 7.41 | wps 450294 | ups 1.04 | wpb 433556 | bsz 16506.3 | num_updates 10117 | lr 0.000628788 | gnorm 0.352 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 22.5 | wall 9903 epoch 006 | loss 4.479 | nll_loss 2.889 | ppl 7.41 | wps 450294 | ups 1.04 | wpb 433556 | bsz 16506.3 | num_updates 10117 | lr 0.000628788 | gnorm 0.352 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 22.5 | wall 9903 epoch 006 | loss 4.479 | nll_loss 2.889 | ppl 7.41 | wps 450294 | ups 1.04 | wpb 433556 | bsz 16506.3 | num_updates 10117 | lr 0.000628788 | gnorm 0.352 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 22.5 | wall 9903 Start iterating over samples epoch 007: 83 / 1689 loss=4.437, nll_loss=2.842, ppl=7.17, wps=452113, ups=1.05, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.349, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=9983 epoch 007: 83 / 1689 loss=4.437, nll_loss=2.842, ppl=7.17, wps=452113, ups=1.05, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.349, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=9983 epoch 007: 83 / 1689 loss=4.437, nll_loss=2.842, ppl=7.17, wps=452113, ups=1.05, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.349, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=9983 epoch 007: 83 / 1689 loss=4.437, nll_loss=2.842, ppl=7.17, wps=452113, ups=1.05, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.349, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=9983 epoch 007: 83 / 1689 loss=4.437, nll_loss=2.842, ppl=7.17, wps=452113, ups=1.05, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.349, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=9983 epoch 007: 83 / 1689 loss=4.437, nll_loss=2.842, ppl=7.17, wps=452113, ups=1.05, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.349, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=9983 epoch 007: 83 / 1689 loss=4.437, nll_loss=2.842, ppl=7.17, wps=452113, ups=1.05, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.349, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=9983 epoch 007: 183 / 1689 loss=4.428, nll_loss=2.832, ppl=7.12, wps=462941, ups=1.07, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.341, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=10077 epoch 007: 183 / 1689 loss=4.428, nll_loss=2.832, ppl=7.12, wps=462941, ups=1.07, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.341, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=10077 epoch 007: 183 / 1689 loss=4.428, nll_loss=2.832, ppl=7.12, wps=462941, ups=1.07, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.341, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=10077 epoch 007: 183 / 1689 loss=4.428, nll_loss=2.832, ppl=7.12, wps=462941, ups=1.07, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.341, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=10077 epoch 007: 183 / 1689 loss=4.428, nll_loss=2.832, ppl=7.12, wps=462941, ups=1.07, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.341, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=10077 epoch 007: 183 / 1689 loss=4.428, nll_loss=2.832, ppl=7.12, wps=462941, ups=1.07, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.341, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=10077 epoch 007: 183 / 1689 loss=4.428, nll_loss=2.832, ppl=7.12, wps=462941, ups=1.07, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.341, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=10077 epoch 007: 283 / 1689 loss=4.433, nll_loss=2.839, ppl=7.15, wps=459731, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.326, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=10171 epoch 007: 283 / 1689 loss=4.433, nll_loss=2.839, ppl=7.15, wps=459731, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.326, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=10171 epoch 007: 283 / 1689 loss=4.433, nll_loss=2.839, ppl=7.15, wps=459731, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.326, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=10171 epoch 007: 283 / 1689 loss=4.433, nll_loss=2.839, ppl=7.15, wps=459731, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.326, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=10171 epoch 007: 283 / 1689 loss=4.433, nll_loss=2.839, ppl=7.15, wps=459731, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.326, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=10171 epoch 007: 283 / 1689 loss=4.433, nll_loss=2.839, ppl=7.15, wps=459731, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.326, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=10171 epoch 007: 283 / 1689 loss=4.433, nll_loss=2.839, ppl=7.15, wps=459731, ups=1.06, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.326, clip=0, loss_scale=2, train_wall=93, gb_free=19.3, wall=10171 epoch 007: 384 / 1689 loss=4.442, nll_loss=2.849, ppl=7.2, wps=461444, ups=1.06, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.334, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=10265 epoch 007: 384 / 1689 loss=4.442, nll_loss=2.849, ppl=7.2, wps=461444, ups=1.06, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.334, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=10265 epoch 007: 384 / 1689 loss=4.442, nll_loss=2.849, ppl=7.2, wps=461444, ups=1.06, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.334, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=10265 epoch 007: 384 / 1689 loss=4.442, nll_loss=2.849, ppl=7.2, wps=461444, ups=1.06, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.334, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=10265 epoch 007: 384 / 1689 loss=4.442, nll_loss=2.849, ppl=7.2, wps=461444, ups=1.06, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.334, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=10265 epoch 007: 384 / 1689 loss=4.442, nll_loss=2.849, ppl=7.2, wps=461444, ups=1.06, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.334, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=10265 epoch 007: 384 / 1689 loss=4.442, nll_loss=2.849, ppl=7.2, wps=461444, ups=1.06, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.334, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=10265 epoch 007: 484 / 1689 loss=4.432, nll_loss=2.838, ppl=7.15, wps=463182, ups=1.07, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.339, clip=0, loss_scale=1, train_wall=92, gb_free=17.9, wall=10359 epoch 007: 484 / 1689 loss=4.432, nll_loss=2.838, ppl=7.15, wps=463182, ups=1.07, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.339, clip=0, loss_scale=1, train_wall=92, gb_free=17.9, wall=10359 epoch 007: 484 / 1689 loss=4.432, nll_loss=2.838, ppl=7.15, wps=463182, ups=1.07, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.339, clip=0, loss_scale=1, train_wall=92, gb_free=17.9, wall=10359 epoch 007: 484 / 1689 loss=4.432, nll_loss=2.838, ppl=7.15, wps=463182, ups=1.07, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.339, clip=0, loss_scale=1, train_wall=92, gb_free=17.9, wall=10359 epoch 007: 484 / 1689 loss=4.432, nll_loss=2.838, ppl=7.15, wps=463182, ups=1.07, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.339, clip=0, loss_scale=1, train_wall=92, gb_free=17.9, wall=10359 epoch 007: 484 / 1689 loss=4.432, nll_loss=2.838, ppl=7.15, wps=463182, ups=1.07, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.339, clip=0, loss_scale=1, train_wall=92, gb_free=17.9, wall=10359 epoch 007: 484 / 1689 loss=4.432, nll_loss=2.838, ppl=7.15, wps=463182, ups=1.07, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.339, clip=0, loss_scale=1, train_wall=92, gb_free=17.9, wall=10359 epoch 007: 584 / 1689 loss=4.426, nll_loss=2.83, ppl=7.11, wps=459095, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.344, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=10453 epoch 007: 584 / 1689 loss=4.426, nll_loss=2.83, ppl=7.11, wps=459095, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.344, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=10453 epoch 007: 584 / 1689 loss=4.426, nll_loss=2.83, ppl=7.11, wps=459095, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.344, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=10453 epoch 007: 584 / 1689 loss=4.426, nll_loss=2.83, ppl=7.11, wps=459095, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.344, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=10453 epoch 007: 584 / 1689 loss=4.426, nll_loss=2.83, ppl=7.11, wps=459095, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.344, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=10453 epoch 007: 584 / 1689 loss=4.426, nll_loss=2.83, ppl=7.11, wps=459095, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.344, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=10453 epoch 007: 584 / 1689 loss=4.426, nll_loss=2.83, ppl=7.11, wps=459095, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.344, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=10453 epoch 007: 684 / 1689 loss=4.428, nll_loss=2.833, ppl=7.13, wps=460264, ups=1.06, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.325, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=10547 epoch 007: 684 / 1689 loss=4.428, nll_loss=2.833, ppl=7.13, wps=460264, ups=1.06, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.325, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=10547 epoch 007: 684 / 1689 loss=4.428, nll_loss=2.833, ppl=7.13, wps=460264, ups=1.06, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.325, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=10547 epoch 007: 684 / 1689 loss=4.428, nll_loss=2.833, ppl=7.13, wps=460264, ups=1.06, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.325, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=10547 epoch 007: 684 / 1689 loss=4.428, nll_loss=2.833, ppl=7.13, wps=460264, ups=1.06, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.325, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=10547 epoch 007: 684 / 1689 loss=4.428, nll_loss=2.833, ppl=7.13, wps=460264, ups=1.06, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.325, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=10547 epoch 007: 684 / 1689 loss=4.428, nll_loss=2.833, ppl=7.13, wps=460264, ups=1.06, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.325, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=10547 epoch 007: 784 / 1689 loss=4.443, nll_loss=2.851, ppl=7.21, wps=460564, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.343, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=10641 epoch 007: 784 / 1689 loss=4.443, nll_loss=2.851, ppl=7.21, wps=460564, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.343, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=10641 epoch 007: 784 / 1689 loss=4.443, nll_loss=2.851, ppl=7.21, wps=460564, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.343, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=10641 epoch 007: 784 / 1689 loss=4.443, nll_loss=2.851, ppl=7.21, wps=460564, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.343, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=10641 epoch 007: 784 / 1689 loss=4.443, nll_loss=2.851, ppl=7.21, wps=460564, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.343, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=10641 epoch 007: 784 / 1689 loss=4.443, nll_loss=2.851, ppl=7.21, wps=460564, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.343, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=10641 epoch 007: 784 / 1689 loss=4.443, nll_loss=2.851, ppl=7.21, wps=460564, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.343, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=10641 epoch 007: 884 / 1689 loss=4.442, nll_loss=2.85, ppl=7.21, wps=463080, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.336, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=10735 epoch 007: 884 / 1689 loss=4.442, nll_loss=2.85, ppl=7.21, wps=463080, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.336, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=10735 epoch 007: 884 / 1689 loss=4.442, nll_loss=2.85, ppl=7.21, wps=463080, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.336, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=10735 epoch 007: 884 / 1689 loss=4.442, nll_loss=2.85, ppl=7.21, wps=463080, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.336, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=10735 epoch 007: 884 / 1689 loss=4.442, nll_loss=2.85, ppl=7.21, wps=463080, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.336, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=10735 epoch 007: 884 / 1689 loss=4.442, nll_loss=2.85, ppl=7.21, wps=463080, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.336, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=10735 epoch 007: 884 / 1689 loss=4.442, nll_loss=2.85, ppl=7.21, wps=463080, ups=1.07, wpb=433106, bsz=16498.1, num_updates=11000, lr=0.000603023, gnorm=0.336, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=10735 begin validation on "valid" subset epoch 007 | valid on 'valid' subset | loss 4.388 | nll_loss 2.743 | ppl 6.69 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.388 epoch 007 | valid on 'valid' subset | loss 4.388 | nll_loss 2.743 | ppl 6.69 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.388 epoch 007 | valid on 'valid' subset | loss 4.388 | nll_loss 2.743 | ppl 6.69 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.388 epoch 007 | valid on 'valid' subset | loss 4.388 | nll_loss 2.743 | ppl 6.69 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.388 epoch 007 | valid on 'valid' subset | loss 4.388 | nll_loss 2.743 | ppl 6.69 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.388 epoch 007 | valid on 'valid' subset | loss 4.388 | nll_loss 2.743 | ppl 6.69 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.388 epoch 007 | valid on 'valid' subset | loss 4.388 | nll_loss 2.743 | ppl 6.69 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.388 epoch 007: 984 / 1689 loss=4.425, nll_loss=2.831, ppl=7.11, wps=294954, ups=0.68, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.359, clip=0, loss_scale=2, train_wall=123, gb_free=19.3, wall=10882 epoch 007: 984 / 1689 loss=4.425, nll_loss=2.831, ppl=7.11, wps=294954, ups=0.68, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.359, clip=0, loss_scale=2, train_wall=123, gb_free=19.3, wall=10882 epoch 007: 984 / 1689 loss=4.425, nll_loss=2.831, ppl=7.11, wps=294954, ups=0.68, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.359, clip=0, loss_scale=2, train_wall=123, gb_free=19.3, wall=10882 epoch 007: 984 / 1689 loss=4.425, nll_loss=2.831, ppl=7.11, wps=294954, ups=0.68, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.359, clip=0, loss_scale=2, train_wall=123, gb_free=19.3, wall=10882 epoch 007: 984 / 1689 loss=4.425, nll_loss=2.831, ppl=7.11, wps=294954, ups=0.68, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.359, clip=0, loss_scale=2, train_wall=123, gb_free=19.3, wall=10882 epoch 007: 984 / 1689 loss=4.425, nll_loss=2.831, ppl=7.11, wps=294954, ups=0.68, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.359, clip=0, loss_scale=2, train_wall=123, gb_free=19.3, wall=10882 epoch 007: 984 / 1689 loss=4.425, nll_loss=2.831, ppl=7.11, wps=294954, ups=0.68, wpb=434160, bsz=16326.5, num_updates=11100, lr=0.0006003, gnorm=0.359, clip=0, loss_scale=2, train_wall=123, gb_free=19.3, wall=10882 epoch 007: 1085 / 1689 loss=4.419, nll_loss=2.824, ppl=7.08, wps=457193, ups=1.06, wpb=432332, bsz=16993.1, num_updates=11200, lr=0.000597614, gnorm=0.339, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=10977 epoch 007: 1085 / 1689 loss=4.419, nll_loss=2.824, ppl=7.08, wps=457193, ups=1.06, wpb=432332, bsz=16993.1, num_updates=11200, lr=0.000597614, gnorm=0.339, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=10977 epoch 007: 1085 / 1689 loss=4.419, nll_loss=2.824, ppl=7.08, wps=457193, ups=1.06, wpb=432332, bsz=16993.1, num_updates=11200, lr=0.000597614, gnorm=0.339, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=10977 epoch 007: 1085 / 1689 loss=4.419, nll_loss=2.824, ppl=7.08, wps=457193, ups=1.06, wpb=432332, bsz=16993.1, num_updates=11200, lr=0.000597614, gnorm=0.339, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=10977 epoch 007: 1085 / 1689 loss=4.419, nll_loss=2.824, ppl=7.08, wps=457193, ups=1.06, wpb=432332, bsz=16993.1, num_updates=11200, lr=0.000597614, gnorm=0.339, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=10977 epoch 007: 1085 / 1689 loss=4.419, nll_loss=2.824, ppl=7.08, wps=457193, ups=1.06, wpb=432332, bsz=16993.1, num_updates=11200, lr=0.000597614, gnorm=0.339, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=10977 epoch 007: 1085 / 1689 loss=4.419, nll_loss=2.824, ppl=7.08, wps=457193, ups=1.06, wpb=432332, bsz=16993.1, num_updates=11200, lr=0.000597614, gnorm=0.339, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=10977 epoch 007: 1185 / 1689 loss=4.428, nll_loss=2.835, ppl=7.14, wps=461211, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11071 epoch 007: 1185 / 1689 loss=4.428, nll_loss=2.835, ppl=7.14, wps=461211, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11071 epoch 007: 1185 / 1689 loss=4.428, nll_loss=2.835, ppl=7.14, wps=461211, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11071 epoch 007: 1185 / 1689 loss=4.428, nll_loss=2.835, ppl=7.14, wps=461211, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11071 epoch 007: 1185 / 1689 loss=4.428, nll_loss=2.835, ppl=7.14, wps=461211, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11071 epoch 007: 1185 / 1689 loss=4.428, nll_loss=2.835, ppl=7.14, wps=461211, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11071 epoch 007: 1185 / 1689 loss=4.428, nll_loss=2.835, ppl=7.14, wps=461211, ups=1.06, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11071 epoch 007: 1285 / 1689 loss=4.429, nll_loss=2.835, ppl=7.14, wps=463059, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.332, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11165 epoch 007: 1285 / 1689 loss=4.429, nll_loss=2.835, ppl=7.14, wps=463059, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.332, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11165 epoch 007: 1285 / 1689 loss=4.429, nll_loss=2.835, ppl=7.14, wps=463059, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.332, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11165 epoch 007: 1285 / 1689 loss=4.429, nll_loss=2.835, ppl=7.14, wps=463059, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.332, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11165 epoch 007: 1285 / 1689 loss=4.429, nll_loss=2.835, ppl=7.14, wps=463059, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.332, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11165 epoch 007: 1285 / 1689 loss=4.429, nll_loss=2.835, ppl=7.14, wps=463059, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.332, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11165 epoch 007: 1285 / 1689 loss=4.429, nll_loss=2.835, ppl=7.14, wps=463059, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.332, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11165 epoch 007: 1385 / 1689 loss=4.437, nll_loss=2.845, ppl=7.19, wps=462877, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.323, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=11259 epoch 007: 1385 / 1689 loss=4.437, nll_loss=2.845, ppl=7.19, wps=462877, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.323, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=11259 epoch 007: 1385 / 1689 loss=4.437, nll_loss=2.845, ppl=7.19, wps=462877, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.323, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=11259 epoch 007: 1385 / 1689 loss=4.437, nll_loss=2.845, ppl=7.19, wps=462877, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.323, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=11259 epoch 007: 1385 / 1689 loss=4.437, nll_loss=2.845, ppl=7.19, wps=462877, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.323, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=11259 epoch 007: 1385 / 1689 loss=4.437, nll_loss=2.845, ppl=7.19, wps=462877, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.323, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=11259 epoch 007: 1385 / 1689 loss=4.437, nll_loss=2.845, ppl=7.19, wps=462877, ups=1.07, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.323, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=11259 epoch 007: 1485 / 1689 loss=4.434, nll_loss=2.842, ppl=7.17, wps=460720, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=11353 epoch 007: 1485 / 1689 loss=4.434, nll_loss=2.842, ppl=7.17, wps=460720, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=11353 epoch 007: 1485 / 1689 loss=4.434, nll_loss=2.842, ppl=7.17, wps=460720, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=11353 epoch 007: 1485 / 1689 loss=4.434, nll_loss=2.842, ppl=7.17, wps=460720, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=11353 epoch 007: 1485 / 1689 loss=4.434, nll_loss=2.842, ppl=7.17, wps=460720, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=11353 epoch 007: 1485 / 1689 loss=4.434, nll_loss=2.842, ppl=7.17, wps=460720, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=11353 epoch 007: 1485 / 1689 loss=4.434, nll_loss=2.842, ppl=7.17, wps=460720, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.322, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=11353 epoch 007: 1585 / 1689 loss=4.426, nll_loss=2.833, ppl=7.13, wps=457948, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.346, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=11448 epoch 007: 1585 / 1689 loss=4.426, nll_loss=2.833, ppl=7.13, wps=457948, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.346, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=11448 epoch 007: 1585 / 1689 loss=4.426, nll_loss=2.833, ppl=7.13, wps=457948, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.346, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=11448 epoch 007: 1585 / 1689 loss=4.426, nll_loss=2.833, ppl=7.13, wps=457948, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.346, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=11448 epoch 007: 1585 / 1689 loss=4.426, nll_loss=2.833, ppl=7.13, wps=457948, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.346, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=11448 epoch 007: 1585 / 1689 loss=4.426, nll_loss=2.833, ppl=7.13, wps=457948, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.346, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=11448 epoch 007: 1585 / 1689 loss=4.426, nll_loss=2.833, ppl=7.13, wps=457948, ups=1.06, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.346, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=11448 epoch 007: 1685 / 1689 loss=4.42, nll_loss=2.826, ppl=7.09, wps=457860, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11542 epoch 007: 1685 / 1689 loss=4.42, nll_loss=2.826, ppl=7.09, wps=457860, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11542 epoch 007: 1685 / 1689 loss=4.42, nll_loss=2.826, ppl=7.09, wps=457860, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11542 epoch 007: 1685 / 1689 loss=4.42, nll_loss=2.826, ppl=7.09, wps=457860, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11542 epoch 007: 1685 / 1689 loss=4.42, nll_loss=2.826, ppl=7.09, wps=457860, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11542 epoch 007: 1685 / 1689 loss=4.42, nll_loss=2.826, ppl=7.09, wps=457860, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11542 epoch 007: 1685 / 1689 loss=4.42, nll_loss=2.826, ppl=7.09, wps=457860, ups=1.06, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11542 end of epoch 7 (average epoch stats below) epoch 007 | loss 4.431 | nll_loss 2.837 | ppl 7.14 | wps 445327 | ups 1.03 | wpb 433529 | bsz 16505.3 | num_updates 11804 | lr 0.000582124 | gnorm 0.335 | clip 0 | loss_scale 2 | train_wall 1592 | gb_free 19.6 | wall 11545 epoch 007 | loss 4.431 | nll_loss 2.837 | ppl 7.14 | wps 445327 | ups 1.03 | wpb 433529 | bsz 16505.3 | num_updates 11804 | lr 0.000582124 | gnorm 0.335 | clip 0 | loss_scale 2 | train_wall 1592 | gb_free 19.6 | wall 11545 epoch 007 | loss 4.431 | nll_loss 2.837 | ppl 7.14 | wps 445327 | ups 1.03 | wpb 433529 | bsz 16505.3 | num_updates 11804 | lr 0.000582124 | gnorm 0.335 | clip 0 | loss_scale 2 | train_wall 1592 | gb_free 19.6 | wall 11545 epoch 007 | loss 4.431 | nll_loss 2.837 | ppl 7.14 | wps 445327 | ups 1.03 | wpb 433529 | bsz 16505.3 | num_updates 11804 | lr 0.000582124 | gnorm 0.335 | clip 0 | loss_scale 2 | train_wall 1592 | gb_free 19.6 | wall 11545 epoch 007 | loss 4.431 | nll_loss 2.837 | ppl 7.14 | wps 445327 | ups 1.03 | wpb 433529 | bsz 16505.3 | num_updates 11804 | lr 0.000582124 | gnorm 0.335 | clip 0 | loss_scale 2 | train_wall 1592 | gb_free 19.6 | wall 11545 epoch 007 | loss 4.431 | nll_loss 2.837 | ppl 7.14 | wps 445327 | ups 1.03 | wpb 433529 | bsz 16505.3 | num_updates 11804 | lr 0.000582124 | gnorm 0.335 | clip 0 | loss_scale 2 | train_wall 1592 | gb_free 19.6 | wall 11545 epoch 007 | loss 4.431 | nll_loss 2.837 | ppl 7.14 | wps 445327 | ups 1.03 | wpb 433529 | bsz 16505.3 | num_updates 11804 | lr 0.000582124 | gnorm 0.335 | clip 0 | loss_scale 2 | train_wall 1592 | gb_free 19.6 | wall 11545 Start iterating over samples epoch 008: 96 / 1689 loss=4.383, nll_loss=2.783, ppl=6.88, wps=456925, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.342, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=11636 epoch 008: 96 / 1689 loss=4.383, nll_loss=2.783, ppl=6.88, wps=456925, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.342, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=11636 epoch 008: 96 / 1689 loss=4.383, nll_loss=2.783, ppl=6.88, wps=456925, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.342, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=11636 epoch 008: 96 / 1689 loss=4.383, nll_loss=2.783, ppl=6.88, wps=456925, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.342, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=11636 epoch 008: 96 / 1689 loss=4.383, nll_loss=2.783, ppl=6.88, wps=456925, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.342, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=11636 epoch 008: 96 / 1689 loss=4.383, nll_loss=2.783, ppl=6.88, wps=456925, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.342, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=11636 epoch 008: 96 / 1689 loss=4.383, nll_loss=2.783, ppl=6.88, wps=456925, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.342, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=11636 epoch 008: 96 / 1689 loss=4.383, nll_loss=2.783, ppl=6.88, wps=456925, ups=1.06, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.342, clip=0, loss_scale=2, train_wall=92, gb_free=20.4, wall=11636 epoch 008: 196 / 1689 loss=4.397, nll_loss=2.799, ppl=6.96, wps=463926, ups=1.07, wpb=434281, bsz=16559, num_updates=12000, lr=0.00057735, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11730 epoch 008: 196 / 1689 loss=4.397, nll_loss=2.799, ppl=6.96, wps=463926, ups=1.07, wpb=434281, bsz=16559, num_updates=12000, lr=0.00057735, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11730 epoch 008: 196 / 1689 loss=4.397, nll_loss=2.799, ppl=6.96, wps=463926, ups=1.07, wpb=434281, bsz=16559, num_updates=12000, lr=0.00057735, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11730 epoch 008: 196 / 1689 loss=4.397, nll_loss=2.799, ppl=6.96, wps=463926, ups=1.07, wpb=434281, bsz=16559, num_updates=12000, lr=0.00057735, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11730 epoch 008: 196 / 1689 loss=4.397, nll_loss=2.799, ppl=6.96, wps=463926, ups=1.07, wpb=434281, bsz=16559, num_updates=12000, lr=0.00057735, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11730 epoch 008: 196 / 1689 loss=4.397, nll_loss=2.799, ppl=6.96, wps=463926, ups=1.07, wpb=434281, bsz=16559, num_updates=12000, lr=0.00057735, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11730 epoch 008: 196 / 1689 loss=4.397, nll_loss=2.799, ppl=6.96, wps=463926, ups=1.07, wpb=434281, bsz=16559, num_updates=12000, lr=0.00057735, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11730 epoch 008: 196 / 1689 loss=4.397, nll_loss=2.799, ppl=6.96, wps=463926, ups=1.07, wpb=434281, bsz=16559, num_updates=12000, lr=0.00057735, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11730 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 4.366 | nll_loss 2.722 | ppl 6.6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.366 epoch 008 | valid on 'valid' subset | loss 4.366 | nll_loss 2.722 | ppl 6.6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.366 epoch 008 | valid on 'valid' subset | loss 4.366 | nll_loss 2.722 | ppl 6.6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.366 epoch 008 | valid on 'valid' subset | loss 4.366 | nll_loss 2.722 | ppl 6.6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.366 epoch 008 | valid on 'valid' subset | loss 4.366 | nll_loss 2.722 | ppl 6.6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.366 epoch 008 | valid on 'valid' subset | loss 4.366 | nll_loss 2.722 | ppl 6.6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.366 epoch 008 | valid on 'valid' subset | loss 4.366 | nll_loss 2.722 | ppl 6.6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.366 epoch 008 | valid on 'valid' subset | loss 4.366 | nll_loss 2.722 | ppl 6.6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.366 epoch 008: 296 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=386127, ups=0.89, wpb=434536, bsz=16604.2, num_updates=12100, lr=0.00057496, gnorm=0.324, clip=0, loss_scale=2, train_wall=92, gb_free=21.7, wall=11843 epoch 008: 296 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=386127, ups=0.89, wpb=434536, bsz=16604.2, num_updates=12100, lr=0.00057496, gnorm=0.324, clip=0, loss_scale=2, train_wall=92, gb_free=21.7, wall=11843 epoch 008: 296 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=386127, ups=0.89, wpb=434536, bsz=16604.2, num_updates=12100, lr=0.00057496, gnorm=0.324, clip=0, loss_scale=2, train_wall=92, gb_free=21.7, wall=11843 epoch 008: 296 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=386127, ups=0.89, wpb=434536, bsz=16604.2, num_updates=12100, lr=0.00057496, gnorm=0.324, clip=0, loss_scale=2, train_wall=92, gb_free=21.7, wall=11843 epoch 008: 296 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=386127, ups=0.89, wpb=434536, bsz=16604.2, num_updates=12100, lr=0.00057496, gnorm=0.324, clip=0, loss_scale=2, train_wall=92, gb_free=21.7, wall=11843 epoch 008: 296 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=386127, ups=0.89, wpb=434536, bsz=16604.2, num_updates=12100, lr=0.00057496, gnorm=0.324, clip=0, loss_scale=2, train_wall=92, gb_free=21.7, wall=11843 epoch 008: 296 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=386127, ups=0.89, wpb=434536, bsz=16604.2, num_updates=12100, lr=0.00057496, gnorm=0.324, clip=0, loss_scale=2, train_wall=92, gb_free=21.7, wall=11843 epoch 008: 296 / 1689 loss=4.4, nll_loss=2.803, ppl=6.98, wps=386127, ups=0.89, wpb=434536, bsz=16604.2, num_updates=12100, lr=0.00057496, gnorm=0.324, clip=0, loss_scale=2, train_wall=92, gb_free=21.7, wall=11843 epoch 008: 396 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=461764, ups=1.06, wpb=433774, bsz=16639.4, num_updates=12200, lr=0.000572598, gnorm=0.325, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11936 epoch 008: 396 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=461764, ups=1.06, wpb=433774, bsz=16639.4, num_updates=12200, lr=0.000572598, gnorm=0.325, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11936 epoch 008: 396 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=461764, ups=1.06, wpb=433774, bsz=16639.4, num_updates=12200, lr=0.000572598, gnorm=0.325, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11936 epoch 008: 396 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=461764, ups=1.06, wpb=433774, bsz=16639.4, num_updates=12200, lr=0.000572598, gnorm=0.325, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11936 epoch 008: 396 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=461764, ups=1.06, wpb=433774, bsz=16639.4, num_updates=12200, lr=0.000572598, gnorm=0.325, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11936 epoch 008: 396 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=461764, ups=1.06, wpb=433774, bsz=16639.4, num_updates=12200, lr=0.000572598, gnorm=0.325, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11936 epoch 008: 396 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=461764, ups=1.06, wpb=433774, bsz=16639.4, num_updates=12200, lr=0.000572598, gnorm=0.325, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11936 epoch 008: 396 / 1689 loss=4.408, nll_loss=2.811, ppl=7.02, wps=461764, ups=1.06, wpb=433774, bsz=16639.4, num_updates=12200, lr=0.000572598, gnorm=0.325, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=11936 epoch 008: 498 / 1689 loss=4.385, nll_loss=2.786, ppl=6.9, wps=449320, ups=1.04, wpb=431035, bsz=15990.2, num_updates=12300, lr=0.000570266, gnorm=0.329, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=12032 epoch 008: 498 / 1689 loss=4.385, nll_loss=2.786, ppl=6.9, wps=449320, ups=1.04, wpb=431035, bsz=15990.2, num_updates=12300, lr=0.000570266, gnorm=0.329, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=12032 epoch 008: 498 / 1689 loss=4.385, nll_loss=2.786, ppl=6.9, wps=449320, ups=1.04, wpb=431035, bsz=15990.2, num_updates=12300, lr=0.000570266, gnorm=0.329, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=12032 epoch 008: 498 / 1689 loss=4.385, nll_loss=2.786, ppl=6.9, wps=449320, ups=1.04, wpb=431035, bsz=15990.2, num_updates=12300, lr=0.000570266, gnorm=0.329, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=12032 epoch 008: 498 / 1689 loss=4.385, nll_loss=2.786, ppl=6.9, wps=449320, ups=1.04, wpb=431035, bsz=15990.2, num_updates=12300, lr=0.000570266, gnorm=0.329, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=12032 epoch 008: 498 / 1689 loss=4.385, nll_loss=2.786, ppl=6.9, wps=449320, ups=1.04, wpb=431035, bsz=15990.2, num_updates=12300, lr=0.000570266, gnorm=0.329, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=12032 epoch 008: 498 / 1689 loss=4.385, nll_loss=2.786, ppl=6.9, wps=449320, ups=1.04, wpb=431035, bsz=15990.2, num_updates=12300, lr=0.000570266, gnorm=0.329, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=12032 epoch 008: 498 / 1689 loss=4.385, nll_loss=2.786, ppl=6.9, wps=449320, ups=1.04, wpb=431035, bsz=15990.2, num_updates=12300, lr=0.000570266, gnorm=0.329, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=12032 epoch 008: 598 / 1689 loss=4.399, nll_loss=2.801, ppl=6.97, wps=459047, ups=1.07, wpb=430165, bsz=16446.9, num_updates=12400, lr=0.000567962, gnorm=0.334, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=12126 epoch 008: 598 / 1689 loss=4.399, nll_loss=2.801, ppl=6.97, wps=459047, ups=1.07, wpb=430165, bsz=16446.9, num_updates=12400, lr=0.000567962, gnorm=0.334, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=12126 epoch 008: 598 / 1689 loss=4.399, nll_loss=2.801, ppl=6.97, wps=459047, ups=1.07, wpb=430165, bsz=16446.9, num_updates=12400, lr=0.000567962, gnorm=0.334, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=12126 epoch 008: 598 / 1689 loss=4.399, nll_loss=2.801, ppl=6.97, wps=459047, ups=1.07, wpb=430165, bsz=16446.9, num_updates=12400, lr=0.000567962, gnorm=0.334, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=12126 epoch 008: 598 / 1689 loss=4.399, nll_loss=2.801, ppl=6.97, wps=459047, ups=1.07, wpb=430165, bsz=16446.9, num_updates=12400, lr=0.000567962, gnorm=0.334, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=12126 epoch 008: 598 / 1689 loss=4.399, nll_loss=2.801, ppl=6.97, wps=459047, ups=1.07, wpb=430165, bsz=16446.9, num_updates=12400, lr=0.000567962, gnorm=0.334, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=12126 epoch 008: 598 / 1689 loss=4.399, nll_loss=2.801, ppl=6.97, wps=459047, ups=1.07, wpb=430165, bsz=16446.9, num_updates=12400, lr=0.000567962, gnorm=0.334, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=12126 epoch 008: 598 / 1689 loss=4.399, nll_loss=2.801, ppl=6.97, wps=459047, ups=1.07, wpb=430165, bsz=16446.9, num_updates=12400, lr=0.000567962, gnorm=0.334, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=12126 epoch 008: 698 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=459470, ups=1.06, wpb=433210, bsz=16546.2, num_updates=12500, lr=0.000565685, gnorm=0.309, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=12220 epoch 008: 698 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=459470, ups=1.06, wpb=433210, bsz=16546.2, num_updates=12500, lr=0.000565685, gnorm=0.309, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=12220 epoch 008: 698 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=459470, ups=1.06, wpb=433210, bsz=16546.2, num_updates=12500, lr=0.000565685, gnorm=0.309, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=12220 epoch 008: 698 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=459470, ups=1.06, wpb=433210, bsz=16546.2, num_updates=12500, lr=0.000565685, gnorm=0.309, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=12220 epoch 008: 698 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=459470, ups=1.06, wpb=433210, bsz=16546.2, num_updates=12500, lr=0.000565685, gnorm=0.309, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=12220 epoch 008: 698 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=459470, ups=1.06, wpb=433210, bsz=16546.2, num_updates=12500, lr=0.000565685, gnorm=0.309, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=12220 epoch 008: 698 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=459470, ups=1.06, wpb=433210, bsz=16546.2, num_updates=12500, lr=0.000565685, gnorm=0.309, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=12220 epoch 008: 698 / 1689 loss=4.384, nll_loss=2.785, ppl=6.89, wps=459470, ups=1.06, wpb=433210, bsz=16546.2, num_updates=12500, lr=0.000565685, gnorm=0.309, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=12220 epoch 008: 798 / 1689 loss=4.395, nll_loss=2.798, ppl=6.95, wps=468398, ups=1.08, wpb=434550, bsz=16399.4, num_updates=12600, lr=0.000563436, gnorm=0.311, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=12313 epoch 008: 798 / 1689 loss=4.395, nll_loss=2.798, ppl=6.95, wps=468398, ups=1.08, wpb=434550, bsz=16399.4, num_updates=12600, lr=0.000563436, gnorm=0.311, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=12313 epoch 008: 798 / 1689 loss=4.395, nll_loss=2.798, ppl=6.95, wps=468398, ups=1.08, wpb=434550, bsz=16399.4, num_updates=12600, lr=0.000563436, gnorm=0.311, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=12313 epoch 008: 798 / 1689 loss=4.395, nll_loss=2.798, ppl=6.95, wps=468398, ups=1.08, wpb=434550, bsz=16399.4, num_updates=12600, lr=0.000563436, gnorm=0.311, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=12313 epoch 008: 798 / 1689 loss=4.395, nll_loss=2.798, ppl=6.95, wps=468398, ups=1.08, wpb=434550, bsz=16399.4, num_updates=12600, lr=0.000563436, gnorm=0.311, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=12313 epoch 008: 798 / 1689 loss=4.395, nll_loss=2.798, ppl=6.95, wps=468398, ups=1.08, wpb=434550, bsz=16399.4, num_updates=12600, lr=0.000563436, gnorm=0.311, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=12313 epoch 008: 798 / 1689 loss=4.395, nll_loss=2.798, ppl=6.95, wps=468398, ups=1.08, wpb=434550, bsz=16399.4, num_updates=12600, lr=0.000563436, gnorm=0.311, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=12313 epoch 008: 798 / 1689 loss=4.395, nll_loss=2.798, ppl=6.95, wps=468398, ups=1.08, wpb=434550, bsz=16399.4, num_updates=12600, lr=0.000563436, gnorm=0.311, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=12313 epoch 008: 898 / 1689 loss=4.407, nll_loss=2.812, ppl=7.02, wps=471485, ups=1.08, wpb=434772, bsz=16619.1, num_updates=12700, lr=0.000561214, gnorm=0.332, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=12405 epoch 008: 898 / 1689 loss=4.407, nll_loss=2.812, ppl=7.02, wps=471485, ups=1.08, wpb=434772, bsz=16619.1, num_updates=12700, lr=0.000561214, gnorm=0.332, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=12405 epoch 008: 898 / 1689 loss=4.407, nll_loss=2.812, ppl=7.02, wps=471485, ups=1.08, wpb=434772, bsz=16619.1, num_updates=12700, lr=0.000561214, gnorm=0.332, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=12405 epoch 008: 898 / 1689 loss=4.407, nll_loss=2.812, ppl=7.02, wps=471485, ups=1.08, wpb=434772, bsz=16619.1, num_updates=12700, lr=0.000561214, gnorm=0.332, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=12405 epoch 008: 898 / 1689 loss=4.407, nll_loss=2.812, ppl=7.02, wps=471485, ups=1.08, wpb=434772, bsz=16619.1, num_updates=12700, lr=0.000561214, gnorm=0.332, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=12405 epoch 008: 898 / 1689 loss=4.407, nll_loss=2.812, ppl=7.02, wps=471485, ups=1.08, wpb=434772, bsz=16619.1, num_updates=12700, lr=0.000561214, gnorm=0.332, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=12405 epoch 008: 898 / 1689 loss=4.407, nll_loss=2.812, ppl=7.02, wps=471485, ups=1.08, wpb=434772, bsz=16619.1, num_updates=12700, lr=0.000561214, gnorm=0.332, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=12405 epoch 008: 898 / 1689 loss=4.407, nll_loss=2.812, ppl=7.02, wps=471485, ups=1.08, wpb=434772, bsz=16619.1, num_updates=12700, lr=0.000561214, gnorm=0.332, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=12405 epoch 008: 998 / 1689 loss=4.409, nll_loss=2.815, ppl=7.04, wps=469149, ups=1.07, wpb=436632, bsz=16354.6, num_updates=12800, lr=0.000559017, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=12498 epoch 008: 998 / 1689 loss=4.409, nll_loss=2.815, ppl=7.04, wps=469149, ups=1.07, wpb=436632, bsz=16354.6, num_updates=12800, lr=0.000559017, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=12498 epoch 008: 998 / 1689 loss=4.409, nll_loss=2.815, ppl=7.04, wps=469149, ups=1.07, wpb=436632, bsz=16354.6, num_updates=12800, lr=0.000559017, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=12498 epoch 008: 998 / 1689 loss=4.409, nll_loss=2.815, ppl=7.04, wps=469149, ups=1.07, wpb=436632, bsz=16354.6, num_updates=12800, lr=0.000559017, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=12498 epoch 008: 998 / 1689 loss=4.409, nll_loss=2.815, ppl=7.04, wps=469149, ups=1.07, wpb=436632, bsz=16354.6, num_updates=12800, lr=0.000559017, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=12498 epoch 008: 998 / 1689 loss=4.409, nll_loss=2.815, ppl=7.04, wps=469149, ups=1.07, wpb=436632, bsz=16354.6, num_updates=12800, lr=0.000559017, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=12498 epoch 008: 998 / 1689 loss=4.409, nll_loss=2.815, ppl=7.04, wps=469149, ups=1.07, wpb=436632, bsz=16354.6, num_updates=12800, lr=0.000559017, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=12498 epoch 008: 998 / 1689 loss=4.409, nll_loss=2.815, ppl=7.04, wps=469149, ups=1.07, wpb=436632, bsz=16354.6, num_updates=12800, lr=0.000559017, gnorm=0.304, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=12498 epoch 008: 1098 / 1689 loss=4.39, nll_loss=2.792, ppl=6.93, wps=466640, ups=1.07, wpb=434319, bsz=16411.7, num_updates=12900, lr=0.000556846, gnorm=0.329, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=12592 epoch 008: 1098 / 1689 loss=4.39, nll_loss=2.792, ppl=6.93, wps=466640, ups=1.07, wpb=434319, bsz=16411.7, num_updates=12900, lr=0.000556846, gnorm=0.329, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=12592 epoch 008: 1098 / 1689 loss=4.39, nll_loss=2.792, ppl=6.93, wps=466640, ups=1.07, wpb=434319, bsz=16411.7, num_updates=12900, lr=0.000556846, gnorm=0.329, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=12592 epoch 008: 1098 / 1689 loss=4.39, nll_loss=2.792, ppl=6.93, wps=466640, ups=1.07, wpb=434319, bsz=16411.7, num_updates=12900, lr=0.000556846, gnorm=0.329, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=12592 epoch 008: 1098 / 1689 loss=4.39, nll_loss=2.792, ppl=6.93, wps=466640, ups=1.07, wpb=434319, bsz=16411.7, num_updates=12900, lr=0.000556846, gnorm=0.329, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=12592 epoch 008: 1098 / 1689 loss=4.39, nll_loss=2.792, ppl=6.93, wps=466640, ups=1.07, wpb=434319, bsz=16411.7, num_updates=12900, lr=0.000556846, gnorm=0.329, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=12592 epoch 008: 1098 / 1689 loss=4.39, nll_loss=2.792, ppl=6.93, wps=466640, ups=1.07, wpb=434319, bsz=16411.7, num_updates=12900, lr=0.000556846, gnorm=0.329, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=12592 epoch 008: 1098 / 1689 loss=4.39, nll_loss=2.792, ppl=6.93, wps=466640, ups=1.07, wpb=434319, bsz=16411.7, num_updates=12900, lr=0.000556846, gnorm=0.329, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=12592 epoch 008: 1198 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=464752, ups=1.07, wpb=433822, bsz=16268.2, num_updates=13000, lr=0.0005547, gnorm=0.301, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=12685 epoch 008: 1198 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=464752, ups=1.07, wpb=433822, bsz=16268.2, num_updates=13000, lr=0.0005547, gnorm=0.301, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=12685 epoch 008: 1198 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=464752, ups=1.07, wpb=433822, bsz=16268.2, num_updates=13000, lr=0.0005547, gnorm=0.301, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=12685 epoch 008: 1198 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=464752, ups=1.07, wpb=433822, bsz=16268.2, num_updates=13000, lr=0.0005547, gnorm=0.301, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=12685 epoch 008: 1198 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=464752, ups=1.07, wpb=433822, bsz=16268.2, num_updates=13000, lr=0.0005547, gnorm=0.301, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=12685 epoch 008: 1198 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=464752, ups=1.07, wpb=433822, bsz=16268.2, num_updates=13000, lr=0.0005547, gnorm=0.301, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=12685 epoch 008: 1198 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=464752, ups=1.07, wpb=433822, bsz=16268.2, num_updates=13000, lr=0.0005547, gnorm=0.301, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=12685 epoch 008: 1198 / 1689 loss=4.387, nll_loss=2.789, ppl=6.91, wps=464752, ups=1.07, wpb=433822, bsz=16268.2, num_updates=13000, lr=0.0005547, gnorm=0.301, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=12685 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 4.357 | nll_loss 2.71 | ppl 6.54 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.357 epoch 008 | valid on 'valid' subset | loss 4.357 | nll_loss 2.71 | ppl 6.54 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.357 epoch 008 | valid on 'valid' subset | loss 4.357 | nll_loss 2.71 | ppl 6.54 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.357 epoch 008 | valid on 'valid' subset | loss 4.357 | nll_loss 2.71 | ppl 6.54 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.357 epoch 008 | valid on 'valid' subset | loss 4.357 | nll_loss 2.71 | ppl 6.54 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.357 epoch 008 | valid on 'valid' subset | loss 4.357 | nll_loss 2.71 | ppl 6.54 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.357 epoch 008 | valid on 'valid' subset | loss 4.357 | nll_loss 2.71 | ppl 6.54 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.357 epoch 008 | valid on 'valid' subset | loss 4.357 | nll_loss 2.71 | ppl 6.54 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.357 epoch 008: 1299 / 1689 loss=4.397, nll_loss=2.801, ppl=6.97, wps=357735, ups=0.82, wpb=434783, bsz=16684.1, num_updates=13100, lr=0.000552579, gnorm=0.304, clip=0, loss_scale=1, train_wall=98, gb_free=19.2, wall=12806 epoch 008: 1299 / 1689 loss=4.397, nll_loss=2.801, ppl=6.97, wps=357735, ups=0.82, wpb=434783, bsz=16684.1, num_updates=13100, lr=0.000552579, gnorm=0.304, clip=0, loss_scale=1, train_wall=98, gb_free=19.2, wall=12806 epoch 008: 1299 / 1689 loss=4.397, nll_loss=2.801, ppl=6.97, wps=357735, ups=0.82, wpb=434783, bsz=16684.1, num_updates=13100, lr=0.000552579, gnorm=0.304, clip=0, loss_scale=1, train_wall=98, gb_free=19.2, wall=12806 epoch 008: 1299 / 1689 loss=4.397, nll_loss=2.801, ppl=6.97, wps=357735, ups=0.82, wpb=434783, bsz=16684.1, num_updates=13100, lr=0.000552579, gnorm=0.304, clip=0, loss_scale=1, train_wall=98, gb_free=19.2, wall=12806 epoch 008: 1299 / 1689 loss=4.397, nll_loss=2.801, ppl=6.97, wps=357735, ups=0.82, wpb=434783, bsz=16684.1, num_updates=13100, lr=0.000552579, gnorm=0.304, clip=0, loss_scale=1, train_wall=98, gb_free=19.2, wall=12806 epoch 008: 1299 / 1689 loss=4.397, nll_loss=2.801, ppl=6.97, wps=357735, ups=0.82, wpb=434783, bsz=16684.1, num_updates=13100, lr=0.000552579, gnorm=0.304, clip=0, loss_scale=1, train_wall=98, gb_free=19.2, wall=12806 epoch 008: 1299 / 1689 loss=4.397, nll_loss=2.801, ppl=6.97, wps=357735, ups=0.82, wpb=434783, bsz=16684.1, num_updates=13100, lr=0.000552579, gnorm=0.304, clip=0, loss_scale=1, train_wall=98, gb_free=19.2, wall=12806 epoch 008: 1299 / 1689 loss=4.397, nll_loss=2.801, ppl=6.97, wps=357735, ups=0.82, wpb=434783, bsz=16684.1, num_updates=13100, lr=0.000552579, gnorm=0.304, clip=0, loss_scale=1, train_wall=98, gb_free=19.2, wall=12806 epoch 008: 1399 / 1689 loss=4.393, nll_loss=2.797, ppl=6.95, wps=464562, ups=1.07, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=12900 epoch 008: 1399 / 1689 loss=4.393, nll_loss=2.797, ppl=6.95, wps=464562, ups=1.07, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=12900 epoch 008: 1399 / 1689 loss=4.393, nll_loss=2.797, ppl=6.95, wps=464562, ups=1.07, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=12900 epoch 008: 1399 / 1689 loss=4.393, nll_loss=2.797, ppl=6.95, wps=464562, ups=1.07, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=12900 epoch 008: 1399 / 1689 loss=4.393, nll_loss=2.797, ppl=6.95, wps=464562, ups=1.07, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=12900 epoch 008: 1399 / 1689 loss=4.393, nll_loss=2.797, ppl=6.95, wps=464562, ups=1.07, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=12900 epoch 008: 1399 / 1689 loss=4.393, nll_loss=2.797, ppl=6.95, wps=464562, ups=1.07, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=12900 epoch 008: 1399 / 1689 loss=4.393, nll_loss=2.797, ppl=6.95, wps=464562, ups=1.07, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=12900 epoch 008: 1499 / 1689 loss=4.385, nll_loss=2.788, ppl=6.91, wps=464712, ups=1.07, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=12993 epoch 008: 1499 / 1689 loss=4.385, nll_loss=2.788, ppl=6.91, wps=464712, ups=1.07, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=12993 epoch 008: 1499 / 1689 loss=4.385, nll_loss=2.788, ppl=6.91, wps=464712, ups=1.07, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=12993 epoch 008: 1499 / 1689 loss=4.385, nll_loss=2.788, ppl=6.91, wps=464712, ups=1.07, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=12993 epoch 008: 1499 / 1689 loss=4.385, nll_loss=2.788, ppl=6.91, wps=464712, ups=1.07, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=12993 epoch 008: 1499 / 1689 loss=4.385, nll_loss=2.788, ppl=6.91, wps=464712, ups=1.07, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=12993 epoch 008: 1499 / 1689 loss=4.385, nll_loss=2.788, ppl=6.91, wps=464712, ups=1.07, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=12993 epoch 008: 1499 / 1689 loss=4.385, nll_loss=2.788, ppl=6.91, wps=464712, ups=1.07, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.319, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=12993 epoch 008: 1599 / 1689 loss=4.386, nll_loss=2.789, ppl=6.91, wps=465898, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.328, clip=0, loss_scale=1, train_wall=91, gb_free=18.9, wall=13086 epoch 008: 1599 / 1689 loss=4.386, nll_loss=2.789, ppl=6.91, wps=465898, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.328, clip=0, loss_scale=1, train_wall=91, gb_free=18.9, wall=13086 epoch 008: 1599 / 1689 loss=4.386, nll_loss=2.789, ppl=6.91, wps=465898, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.328, clip=0, loss_scale=1, train_wall=91, gb_free=18.9, wall=13086 epoch 008: 1599 / 1689 loss=4.386, nll_loss=2.789, ppl=6.91, wps=465898, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.328, clip=0, loss_scale=1, train_wall=91, gb_free=18.9, wall=13086 epoch 008: 1599 / 1689 loss=4.386, nll_loss=2.789, ppl=6.91, wps=465898, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.328, clip=0, loss_scale=1, train_wall=91, gb_free=18.9, wall=13086 epoch 008: 1599 / 1689 loss=4.386, nll_loss=2.789, ppl=6.91, wps=465898, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.328, clip=0, loss_scale=1, train_wall=91, gb_free=18.9, wall=13086 epoch 008: 1599 / 1689 loss=4.386, nll_loss=2.789, ppl=6.91, wps=465898, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.328, clip=0, loss_scale=1, train_wall=91, gb_free=18.9, wall=13086 epoch 008: 1599 / 1689 loss=4.386, nll_loss=2.789, ppl=6.91, wps=465898, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.328, clip=0, loss_scale=1, train_wall=91, gb_free=18.9, wall=13086 end of epoch 8 (average epoch stats below) epoch 008 | loss 4.394 | nll_loss 2.797 | ppl 6.95 | wps 450072 | ups 1.04 | wpb 433514 | bsz 16500.8 | num_updates 13490 | lr 0.000544533 | gnorm 0.32 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 20.2 | wall 13169 epoch 008 | loss 4.394 | nll_loss 2.797 | ppl 6.95 | wps 450072 | ups 1.04 | wpb 433514 | bsz 16500.8 | num_updates 13490 | lr 0.000544533 | gnorm 0.32 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 20.2 | wall 13169 epoch 008 | loss 4.394 | nll_loss 2.797 | ppl 6.95 | wps 450072 | ups 1.04 | wpb 433514 | bsz 16500.8 | num_updates 13490 | lr 0.000544533 | gnorm 0.32 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 20.2 | wall 13169 epoch 008 | loss 4.394 | nll_loss 2.797 | ppl 6.95 | wps 450072 | ups 1.04 | wpb 433514 | bsz 16500.8 | num_updates 13490 | lr 0.000544533 | gnorm 0.32 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 20.2 | wall 13169 epoch 008 | loss 4.394 | nll_loss 2.797 | ppl 6.95 | wps 450072 | ups 1.04 | wpb 433514 | bsz 16500.8 | num_updates 13490 | lr 0.000544533 | gnorm 0.32 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 20.2 | wall 13169 epoch 008 | loss 4.394 | nll_loss 2.797 | ppl 6.95 | wps 450072 | ups 1.04 | wpb 433514 | bsz 16500.8 | num_updates 13490 | lr 0.000544533 | gnorm 0.32 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 20.2 | wall 13169 epoch 008 | loss 4.394 | nll_loss 2.797 | ppl 6.95 | wps 450072 | ups 1.04 | wpb 433514 | bsz 16500.8 | num_updates 13490 | lr 0.000544533 | gnorm 0.32 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 20.2 | wall 13169 epoch 008 | loss 4.394 | nll_loss 2.797 | ppl 6.95 | wps 450072 | ups 1.04 | wpb 433514 | bsz 16500.8 | num_updates 13490 | lr 0.000544533 | gnorm 0.32 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 20.2 | wall 13169 Start iterating over samples epoch 009: 10 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=460472, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.321, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=13179 epoch 009: 10 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=460472, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.321, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=13179 epoch 009: 10 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=460472, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.321, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=13179 epoch 009: 10 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=460472, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.321, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=13179 epoch 009: 10 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=460472, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.321, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=13179 epoch 009: 10 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=460472, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.321, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=13179 epoch 009: 10 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=460472, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.321, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=13179 epoch 009: 10 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=460472, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.321, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=13179 epoch 009: 10 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=460472, ups=1.07, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.321, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=13179 epoch 009: 110 / 1689 loss=4.351, nll_loss=2.748, ppl=6.72, wps=463119, ups=1.07, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13272 epoch 009: 110 / 1689 loss=4.351, nll_loss=2.748, ppl=6.72, wps=463119, ups=1.07, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13272 epoch 009: 110 / 1689 loss=4.351, nll_loss=2.748, ppl=6.72, wps=463119, ups=1.07, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13272 epoch 009: 110 / 1689 loss=4.351, nll_loss=2.748, ppl=6.72, wps=463119, ups=1.07, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13272 epoch 009: 110 / 1689 loss=4.351, nll_loss=2.748, ppl=6.72, wps=463119, ups=1.07, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13272 epoch 009: 110 / 1689 loss=4.351, nll_loss=2.748, ppl=6.72, wps=463119, ups=1.07, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13272 epoch 009: 110 / 1689 loss=4.351, nll_loss=2.748, ppl=6.72, wps=463119, ups=1.07, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13272 epoch 009: 110 / 1689 loss=4.351, nll_loss=2.748, ppl=6.72, wps=463119, ups=1.07, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13272 epoch 009: 110 / 1689 loss=4.351, nll_loss=2.748, ppl=6.72, wps=463119, ups=1.07, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.305, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13272 epoch 009: 210 / 1689 loss=4.365, nll_loss=2.765, ppl=6.8, wps=461292, ups=1.06, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13367 epoch 009: 210 / 1689 loss=4.365, nll_loss=2.765, ppl=6.8, wps=461292, ups=1.06, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13367 epoch 009: 210 / 1689 loss=4.365, nll_loss=2.765, ppl=6.8, wps=461292, ups=1.06, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13367 epoch 009: 210 / 1689 loss=4.365, nll_loss=2.765, ppl=6.8, wps=461292, ups=1.06, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13367 epoch 009: 210 / 1689 loss=4.365, nll_loss=2.765, ppl=6.8, wps=461292, ups=1.06, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13367 epoch 009: 210 / 1689 loss=4.365, nll_loss=2.765, ppl=6.8, wps=461292, ups=1.06, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13367 epoch 009: 210 / 1689 loss=4.365, nll_loss=2.765, ppl=6.8, wps=461292, ups=1.06, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13367 epoch 009: 210 / 1689 loss=4.365, nll_loss=2.765, ppl=6.8, wps=461292, ups=1.06, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13367 epoch 009: 210 / 1689 loss=4.365, nll_loss=2.765, ppl=6.8, wps=461292, ups=1.06, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.313, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13367 epoch 009: 311 / 1689 loss=4.37, nll_loss=2.77, ppl=6.82, wps=456410, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.311, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=13462 epoch 009: 311 / 1689 loss=4.37, nll_loss=2.77, ppl=6.82, wps=456410, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.311, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=13462 epoch 009: 311 / 1689 loss=4.37, nll_loss=2.77, ppl=6.82, wps=456410, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.311, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=13462 epoch 009: 311 / 1689 loss=4.37, nll_loss=2.77, ppl=6.82, wps=456410, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.311, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=13462 epoch 009: 311 / 1689 loss=4.37, nll_loss=2.77, ppl=6.82, wps=456410, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.311, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=13462 epoch 009: 311 / 1689 loss=4.37, nll_loss=2.77, ppl=6.82, wps=456410, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.311, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=13462 epoch 009: 311 / 1689 loss=4.37, nll_loss=2.77, ppl=6.82, wps=456410, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.311, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=13462 epoch 009: 311 / 1689 loss=4.37, nll_loss=2.77, ppl=6.82, wps=456410, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.311, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=13462 epoch 009: 311 / 1689 loss=4.37, nll_loss=2.77, ppl=6.82, wps=456410, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.311, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=13462 epoch 009: 411 / 1689 loss=4.362, nll_loss=2.761, ppl=6.78, wps=464803, ups=1.07, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.306, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=13556 epoch 009: 411 / 1689 loss=4.362, nll_loss=2.761, ppl=6.78, wps=464803, ups=1.07, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.306, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=13556 epoch 009: 411 / 1689 loss=4.362, nll_loss=2.761, ppl=6.78, wps=464803, ups=1.07, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.306, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=13556 epoch 009: 411 / 1689 loss=4.362, nll_loss=2.761, ppl=6.78, wps=464803, ups=1.07, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.306, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=13556 epoch 009: 411 / 1689 loss=4.362, nll_loss=2.761, ppl=6.78, wps=464803, ups=1.07, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.306, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=13556 epoch 009: 411 / 1689 loss=4.362, nll_loss=2.761, ppl=6.78, wps=464803, ups=1.07, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.306, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=13556 epoch 009: 411 / 1689 loss=4.362, nll_loss=2.761, ppl=6.78, wps=464803, ups=1.07, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.306, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=13556 epoch 009: 411 / 1689 loss=4.362, nll_loss=2.761, ppl=6.78, wps=464803, ups=1.07, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.306, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=13556 epoch 009: 411 / 1689 loss=4.362, nll_loss=2.761, ppl=6.78, wps=464803, ups=1.07, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.306, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=13556 epoch 009: 511 / 1689 loss=4.365, nll_loss=2.764, ppl=6.79, wps=461332, ups=1.07, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.32, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=13649 epoch 009: 511 / 1689 loss=4.365, nll_loss=2.764, ppl=6.79, wps=461332, ups=1.07, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.32, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=13649 epoch 009: 511 / 1689 loss=4.365, nll_loss=2.764, ppl=6.79, wps=461332, ups=1.07, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.32, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=13649 epoch 009: 511 / 1689 loss=4.365, nll_loss=2.764, ppl=6.79, wps=461332, ups=1.07, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.32, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=13649 epoch 009: 511 / 1689 loss=4.365, nll_loss=2.764, ppl=6.79, wps=461332, ups=1.07, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.32, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=13649 epoch 009: 511 / 1689 loss=4.365, nll_loss=2.764, ppl=6.79, wps=461332, ups=1.07, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.32, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=13649 epoch 009: 511 / 1689 loss=4.365, nll_loss=2.764, ppl=6.79, wps=461332, ups=1.07, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.32, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=13649 epoch 009: 511 / 1689 loss=4.365, nll_loss=2.764, ppl=6.79, wps=461332, ups=1.07, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.32, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=13649 epoch 009: 511 / 1689 loss=4.365, nll_loss=2.764, ppl=6.79, wps=461332, ups=1.07, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.32, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=13649 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 4.35 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.35 epoch 009 | valid on 'valid' subset | loss 4.35 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.35 epoch 009 | valid on 'valid' subset | loss 4.35 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.35 epoch 009 | valid on 'valid' subset | loss 4.35 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.35 epoch 009 | valid on 'valid' subset | loss 4.35 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.35 epoch 009 | valid on 'valid' subset | loss 4.35 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.35 epoch 009 | valid on 'valid' subset | loss 4.35 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.35 epoch 009 | valid on 'valid' subset | loss 4.35 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.35 epoch 009 | valid on 'valid' subset | loss 4.35 | nll_loss 2.704 | ppl 6.52 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.35 epoch 009: 611 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=355917, ups=0.82, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.314, clip=0, loss_scale=1, train_wall=100, gb_free=19.1, wall=13771 epoch 009: 611 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=355917, ups=0.82, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.314, clip=0, loss_scale=1, train_wall=100, gb_free=19.1, wall=13771 epoch 009: 611 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=355917, ups=0.82, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.314, clip=0, loss_scale=1, train_wall=100, gb_free=19.1, wall=13771 epoch 009: 611 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=355917, ups=0.82, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.314, clip=0, loss_scale=1, train_wall=100, gb_free=19.1, wall=13771 epoch 009: 611 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=355917, ups=0.82, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.314, clip=0, loss_scale=1, train_wall=100, gb_free=19.1, wall=13771 epoch 009: 611 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=355917, ups=0.82, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.314, clip=0, loss_scale=1, train_wall=100, gb_free=19.1, wall=13771 epoch 009: 611 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=355917, ups=0.82, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.314, clip=0, loss_scale=1, train_wall=100, gb_free=19.1, wall=13771 epoch 009: 611 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=355917, ups=0.82, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.314, clip=0, loss_scale=1, train_wall=100, gb_free=19.1, wall=13771 epoch 009: 611 / 1689 loss=4.373, nll_loss=2.773, ppl=6.84, wps=355917, ups=0.82, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.314, clip=0, loss_scale=1, train_wall=100, gb_free=19.1, wall=13771 epoch 009: 711 / 1689 loss=4.377, nll_loss=2.779, ppl=6.86, wps=465517, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.298, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=13865 epoch 009: 711 / 1689 loss=4.377, nll_loss=2.779, ppl=6.86, wps=465517, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.298, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=13865 epoch 009: 711 / 1689 loss=4.377, nll_loss=2.779, ppl=6.86, wps=465517, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.298, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=13865 epoch 009: 711 / 1689 loss=4.377, nll_loss=2.779, ppl=6.86, wps=465517, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.298, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=13865 epoch 009: 711 / 1689 loss=4.377, nll_loss=2.779, ppl=6.86, wps=465517, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.298, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=13865 epoch 009: 711 / 1689 loss=4.377, nll_loss=2.779, ppl=6.86, wps=465517, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.298, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=13865 epoch 009: 711 / 1689 loss=4.377, nll_loss=2.779, ppl=6.86, wps=465517, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.298, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=13865 epoch 009: 711 / 1689 loss=4.377, nll_loss=2.779, ppl=6.86, wps=465517, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.298, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=13865 epoch 009: 711 / 1689 loss=4.377, nll_loss=2.779, ppl=6.86, wps=465517, ups=1.07, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.298, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=13865 epoch 009: 812 / 1689 loss=4.363, nll_loss=2.763, ppl=6.79, wps=457413, ups=1.06, wpb=432876, bsz=16633.4, num_updates=14300, lr=0.000528886, gnorm=0.324, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=13959 epoch 009: 812 / 1689 loss=4.363, nll_loss=2.763, ppl=6.79, wps=457413, ups=1.06, wpb=432876, bsz=16633.4, num_updates=14300, lr=0.000528886, gnorm=0.324, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=13959 epoch 009: 812 / 1689 loss=4.363, nll_loss=2.763, ppl=6.79, wps=457413, ups=1.06, wpb=432876, bsz=16633.4, num_updates=14300, lr=0.000528886, gnorm=0.324, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=13959 epoch 009: 812 / 1689 loss=4.363, nll_loss=2.763, ppl=6.79, wps=457413, ups=1.06, wpb=432876, bsz=16633.4, num_updates=14300, lr=0.000528886, gnorm=0.324, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=13959 epoch 009: 812 / 1689 loss=4.363, nll_loss=2.763, ppl=6.79, wps=457413, ups=1.06, wpb=432876, bsz=16633.4, num_updates=14300, lr=0.000528886, gnorm=0.324, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=13959 epoch 009: 812 / 1689 loss=4.363, nll_loss=2.763, ppl=6.79, wps=457413, ups=1.06, wpb=432876, bsz=16633.4, num_updates=14300, lr=0.000528886, gnorm=0.324, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=13959 epoch 009: 812 / 1689 loss=4.363, nll_loss=2.763, ppl=6.79, wps=457413, ups=1.06, wpb=432876, bsz=16633.4, num_updates=14300, lr=0.000528886, gnorm=0.324, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=13959 epoch 009: 812 / 1689 loss=4.363, nll_loss=2.763, ppl=6.79, wps=457413, ups=1.06, wpb=432876, bsz=16633.4, num_updates=14300, lr=0.000528886, gnorm=0.324, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=13959 epoch 009: 812 / 1689 loss=4.363, nll_loss=2.763, ppl=6.79, wps=457413, ups=1.06, wpb=432876, bsz=16633.4, num_updates=14300, lr=0.000528886, gnorm=0.324, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=13959 epoch 009: 912 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=459792, ups=1.06, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.307, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14054 epoch 009: 912 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=459792, ups=1.06, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.307, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14054 epoch 009: 912 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=459792, ups=1.06, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.307, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14054 epoch 009: 912 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=459792, ups=1.06, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.307, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14054 epoch 009: 912 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=459792, ups=1.06, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.307, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14054 epoch 009: 912 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=459792, ups=1.06, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.307, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14054 epoch 009: 912 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=459792, ups=1.06, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.307, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14054 epoch 009: 912 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=459792, ups=1.06, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.307, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14054 epoch 009: 912 / 1689 loss=4.362, nll_loss=2.762, ppl=6.78, wps=459792, ups=1.06, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.307, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14054 epoch 009: 1012 / 1689 loss=4.352, nll_loss=2.751, ppl=6.73, wps=458629, ups=1.06, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.298, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14148 epoch 009: 1012 / 1689 loss=4.352, nll_loss=2.751, ppl=6.73, wps=458629, ups=1.06, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.298, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14148 epoch 009: 1012 / 1689 loss=4.352, nll_loss=2.751, ppl=6.73, wps=458629, ups=1.06, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.298, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14148 epoch 009: 1012 / 1689 loss=4.352, nll_loss=2.751, ppl=6.73, wps=458629, ups=1.06, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.298, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14148 epoch 009: 1012 / 1689 loss=4.352, nll_loss=2.751, ppl=6.73, wps=458629, ups=1.06, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.298, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14148 epoch 009: 1012 / 1689 loss=4.352, nll_loss=2.751, ppl=6.73, wps=458629, ups=1.06, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.298, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14148 epoch 009: 1012 / 1689 loss=4.352, nll_loss=2.751, ppl=6.73, wps=458629, ups=1.06, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.298, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14148 epoch 009: 1012 / 1689 loss=4.352, nll_loss=2.751, ppl=6.73, wps=458629, ups=1.06, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.298, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14148 epoch 009: 1012 / 1689 loss=4.352, nll_loss=2.751, ppl=6.73, wps=458629, ups=1.06, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.298, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14148 epoch 009: 1112 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=462756, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.301, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=14242 epoch 009: 1112 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=462756, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.301, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=14242 epoch 009: 1112 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=462756, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.301, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=14242 epoch 009: 1112 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=462756, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.301, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=14242 epoch 009: 1112 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=462756, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.301, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=14242 epoch 009: 1112 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=462756, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.301, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=14242 epoch 009: 1112 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=462756, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.301, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=14242 epoch 009: 1112 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=462756, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.301, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=14242 epoch 009: 1112 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=462756, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.301, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=14242 epoch 009: 1212 / 1689 loss=4.354, nll_loss=2.753, ppl=6.74, wps=459160, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.301, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14336 epoch 009: 1212 / 1689 loss=4.354, nll_loss=2.753, ppl=6.74, wps=459160, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.301, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14336 epoch 009: 1212 / 1689 loss=4.354, nll_loss=2.753, ppl=6.74, wps=459160, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.301, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14336 epoch 009: 1212 / 1689 loss=4.354, nll_loss=2.753, ppl=6.74, wps=459160, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.301, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14336 epoch 009: 1212 / 1689 loss=4.354, nll_loss=2.753, ppl=6.74, wps=459160, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.301, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14336 epoch 009: 1212 / 1689 loss=4.354, nll_loss=2.753, ppl=6.74, wps=459160, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.301, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14336 epoch 009: 1212 / 1689 loss=4.354, nll_loss=2.753, ppl=6.74, wps=459160, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.301, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14336 epoch 009: 1212 / 1689 loss=4.354, nll_loss=2.753, ppl=6.74, wps=459160, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.301, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14336 epoch 009: 1212 / 1689 loss=4.354, nll_loss=2.753, ppl=6.74, wps=459160, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.301, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14336 epoch 009: 1312 / 1689 loss=4.361, nll_loss=2.761, ppl=6.78, wps=459378, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=14430 epoch 009: 1312 / 1689 loss=4.361, nll_loss=2.761, ppl=6.78, wps=459378, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=14430 epoch 009: 1312 / 1689 loss=4.361, nll_loss=2.761, ppl=6.78, wps=459378, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=14430 epoch 009: 1312 / 1689 loss=4.361, nll_loss=2.761, ppl=6.78, wps=459378, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=14430 epoch 009: 1312 / 1689 loss=4.361, nll_loss=2.761, ppl=6.78, wps=459378, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=14430 epoch 009: 1312 / 1689 loss=4.361, nll_loss=2.761, ppl=6.78, wps=459378, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=14430 epoch 009: 1312 / 1689 loss=4.361, nll_loss=2.761, ppl=6.78, wps=459378, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=14430 epoch 009: 1312 / 1689 loss=4.361, nll_loss=2.761, ppl=6.78, wps=459378, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=14430 epoch 009: 1312 / 1689 loss=4.361, nll_loss=2.761, ppl=6.78, wps=459378, ups=1.06, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.297, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=14430 epoch 009: 1412 / 1689 loss=4.364, nll_loss=2.765, ppl=6.8, wps=457784, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=14525 epoch 009: 1412 / 1689 loss=4.364, nll_loss=2.765, ppl=6.8, wps=457784, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=14525 epoch 009: 1412 / 1689 loss=4.364, nll_loss=2.765, ppl=6.8, wps=457784, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=14525 epoch 009: 1412 / 1689 loss=4.364, nll_loss=2.765, ppl=6.8, wps=457784, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=14525 epoch 009: 1412 / 1689 loss=4.364, nll_loss=2.765, ppl=6.8, wps=457784, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=14525 epoch 009: 1412 / 1689 loss=4.364, nll_loss=2.765, ppl=6.8, wps=457784, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=14525 epoch 009: 1412 / 1689 loss=4.364, nll_loss=2.765, ppl=6.8, wps=457784, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=14525 epoch 009: 1412 / 1689 loss=4.364, nll_loss=2.765, ppl=6.8, wps=457784, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=14525 epoch 009: 1412 / 1689 loss=4.364, nll_loss=2.765, ppl=6.8, wps=457784, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.3, clip=0, loss_scale=2, train_wall=93, gb_free=18, wall=14525 epoch 009: 1512 / 1689 loss=4.356, nll_loss=2.756, ppl=6.75, wps=460311, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.299, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14619 epoch 009: 1512 / 1689 loss=4.356, nll_loss=2.756, ppl=6.75, wps=460311, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.299, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14619 epoch 009: 1512 / 1689 loss=4.356, nll_loss=2.756, ppl=6.75, wps=460311, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.299, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14619 epoch 009: 1512 / 1689 loss=4.356, nll_loss=2.756, ppl=6.75, wps=460311, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.299, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14619 epoch 009: 1512 / 1689 loss=4.356, nll_loss=2.756, ppl=6.75, wps=460311, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.299, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14619 epoch 009: 1512 / 1689 loss=4.356, nll_loss=2.756, ppl=6.75, wps=460311, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.299, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14619 epoch 009: 1512 / 1689 loss=4.356, nll_loss=2.756, ppl=6.75, wps=460311, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.299, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14619 epoch 009: 1512 / 1689 loss=4.356, nll_loss=2.756, ppl=6.75, wps=460311, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.299, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14619 epoch 009: 1512 / 1689 loss=4.356, nll_loss=2.756, ppl=6.75, wps=460311, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.299, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14619 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 4.326 | nll_loss 2.679 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.326 epoch 009 | valid on 'valid' subset | loss 4.326 | nll_loss 2.679 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.326 epoch 009 | valid on 'valid' subset | loss 4.326 | nll_loss 2.679 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.326 epoch 009 | valid on 'valid' subset | loss 4.326 | nll_loss 2.679 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.326 epoch 009 | valid on 'valid' subset | loss 4.326 | nll_loss 2.679 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.326 epoch 009 | valid on 'valid' subset | loss 4.326 | nll_loss 2.679 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.326 epoch 009 | valid on 'valid' subset | loss 4.326 | nll_loss 2.679 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.326 epoch 009 | valid on 'valid' subset | loss 4.326 | nll_loss 2.679 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.326 epoch 009 | valid on 'valid' subset | loss 4.326 | nll_loss 2.679 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.326 epoch 009: 1612 / 1689 loss=4.377, nll_loss=2.78, ppl=6.87, wps=389004, ups=0.89, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.307, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=14731 epoch 009: 1612 / 1689 loss=4.377, nll_loss=2.78, ppl=6.87, wps=389004, ups=0.89, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.307, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=14731 epoch 009: 1612 / 1689 loss=4.377, nll_loss=2.78, ppl=6.87, wps=389004, ups=0.89, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.307, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=14731 epoch 009: 1612 / 1689 loss=4.377, nll_loss=2.78, ppl=6.87, wps=389004, ups=0.89, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.307, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=14731 epoch 009: 1612 / 1689 loss=4.377, nll_loss=2.78, ppl=6.87, wps=389004, ups=0.89, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.307, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=14731 epoch 009: 1612 / 1689 loss=4.377, nll_loss=2.78, ppl=6.87, wps=389004, ups=0.89, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.307, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=14731 epoch 009: 1612 / 1689 loss=4.377, nll_loss=2.78, ppl=6.87, wps=389004, ups=0.89, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.307, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=14731 epoch 009: 1612 / 1689 loss=4.377, nll_loss=2.78, ppl=6.87, wps=389004, ups=0.89, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.307, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=14731 epoch 009: 1612 / 1689 loss=4.377, nll_loss=2.78, ppl=6.87, wps=389004, ups=0.89, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.307, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=14731 end of epoch 9 (average epoch stats below) epoch 009 | loss 4.364 | nll_loss 2.764 | ppl 6.79 | wps 447775 | ups 1.03 | wpb 433536 | bsz 16502.7 | num_updates 15177 | lr 0.000513378 | gnorm 0.307 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 20.3 | wall 14802 epoch 009 | loss 4.364 | nll_loss 2.764 | ppl 6.79 | wps 447775 | ups 1.03 | wpb 433536 | bsz 16502.7 | num_updates 15177 | lr 0.000513378 | gnorm 0.307 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 20.3 | wall 14802 epoch 009 | loss 4.364 | nll_loss 2.764 | ppl 6.79 | wps 447775 | ups 1.03 | wpb 433536 | bsz 16502.7 | num_updates 15177 | lr 0.000513378 | gnorm 0.307 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 20.3 | wall 14802 epoch 009 | loss 4.364 | nll_loss 2.764 | ppl 6.79 | wps 447775 | ups 1.03 | wpb 433536 | bsz 16502.7 | num_updates 15177 | lr 0.000513378 | gnorm 0.307 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 20.3 | wall 14802 epoch 009 | loss 4.364 | nll_loss 2.764 | ppl 6.79 | wps 447775 | ups 1.03 | wpb 433536 | bsz 16502.7 | num_updates 15177 | lr 0.000513378 | gnorm 0.307 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 20.3 | wall 14802 epoch 009 | loss 4.364 | nll_loss 2.764 | ppl 6.79 | wps 447775 | ups 1.03 | wpb 433536 | bsz 16502.7 | num_updates 15177 | lr 0.000513378 | gnorm 0.307 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 20.3 | wall 14802 epoch 009 | loss 4.364 | nll_loss 2.764 | ppl 6.79 | wps 447775 | ups 1.03 | wpb 433536 | bsz 16502.7 | num_updates 15177 | lr 0.000513378 | gnorm 0.307 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 20.3 | wall 14802 epoch 009 | loss 4.364 | nll_loss 2.764 | ppl 6.79 | wps 447775 | ups 1.03 | wpb 433536 | bsz 16502.7 | num_updates 15177 | lr 0.000513378 | gnorm 0.307 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 20.3 | wall 14802 epoch 009 | loss 4.364 | nll_loss 2.764 | ppl 6.79 | wps 447775 | ups 1.03 | wpb 433536 | bsz 16502.7 | num_updates 15177 | lr 0.000513378 | gnorm 0.307 | clip 0 | loss_scale 2 | train_wall 1569 | gb_free 20.3 | wall 14802 Start iterating over samples epoch 010: 23 / 1689 loss=4.371, nll_loss=2.772, ppl=6.83, wps=460012, ups=1.07, wpb=431345, bsz=16684.3, num_updates=15200, lr=0.000512989, gnorm=0.318, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=14825 epoch 010: 23 / 1689 loss=4.371, nll_loss=2.772, ppl=6.83, wps=460012, ups=1.07, wpb=431345, bsz=16684.3, num_updates=15200, lr=0.000512989, gnorm=0.318, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=14825 epoch 010: 23 / 1689 loss=4.371, nll_loss=2.772, ppl=6.83, wps=460012, ups=1.07, wpb=431345, bsz=16684.3, num_updates=15200, lr=0.000512989, gnorm=0.318, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=14825 epoch 010: 23 / 1689 loss=4.371, nll_loss=2.772, ppl=6.83, wps=460012, ups=1.07, wpb=431345, bsz=16684.3, num_updates=15200, lr=0.000512989, gnorm=0.318, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=14825 epoch 010: 23 / 1689 loss=4.371, nll_loss=2.772, ppl=6.83, wps=460012, ups=1.07, wpb=431345, bsz=16684.3, num_updates=15200, lr=0.000512989, gnorm=0.318, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=14825 epoch 010: 23 / 1689 loss=4.371, nll_loss=2.772, ppl=6.83, wps=460012, ups=1.07, wpb=431345, bsz=16684.3, num_updates=15200, lr=0.000512989, gnorm=0.318, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=14825 epoch 010: 23 / 1689 loss=4.371, nll_loss=2.772, ppl=6.83, wps=460012, ups=1.07, wpb=431345, bsz=16684.3, num_updates=15200, lr=0.000512989, gnorm=0.318, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=14825 epoch 010: 23 / 1689 loss=4.371, nll_loss=2.772, ppl=6.83, wps=460012, ups=1.07, wpb=431345, bsz=16684.3, num_updates=15200, lr=0.000512989, gnorm=0.318, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=14825 epoch 010: 23 / 1689 loss=4.371, nll_loss=2.772, ppl=6.83, wps=460012, ups=1.07, wpb=431345, bsz=16684.3, num_updates=15200, lr=0.000512989, gnorm=0.318, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=14825 epoch 010: 23 / 1689 loss=4.371, nll_loss=2.772, ppl=6.83, wps=460012, ups=1.07, wpb=431345, bsz=16684.3, num_updates=15200, lr=0.000512989, gnorm=0.318, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=14825 epoch 010: 124 / 1689 loss=4.34, nll_loss=2.736, ppl=6.66, wps=455228, ups=1.05, wpb=434222, bsz=16441, num_updates=15300, lr=0.00051131, gnorm=0.288, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=14920 epoch 010: 124 / 1689 loss=4.34, nll_loss=2.736, ppl=6.66, wps=455228, ups=1.05, wpb=434222, bsz=16441, num_updates=15300, lr=0.00051131, gnorm=0.288, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=14920 epoch 010: 124 / 1689 loss=4.34, nll_loss=2.736, ppl=6.66, wps=455228, ups=1.05, wpb=434222, bsz=16441, num_updates=15300, lr=0.00051131, gnorm=0.288, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=14920 epoch 010: 124 / 1689 loss=4.34, nll_loss=2.736, ppl=6.66, wps=455228, ups=1.05, wpb=434222, bsz=16441, num_updates=15300, lr=0.00051131, gnorm=0.288, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=14920 epoch 010: 124 / 1689 loss=4.34, nll_loss=2.736, ppl=6.66, wps=455228, ups=1.05, wpb=434222, bsz=16441, num_updates=15300, lr=0.00051131, gnorm=0.288, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=14920 epoch 010: 124 / 1689 loss=4.34, nll_loss=2.736, ppl=6.66, wps=455228, ups=1.05, wpb=434222, bsz=16441, num_updates=15300, lr=0.00051131, gnorm=0.288, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=14920 epoch 010: 124 / 1689 loss=4.34, nll_loss=2.736, ppl=6.66, wps=455228, ups=1.05, wpb=434222, bsz=16441, num_updates=15300, lr=0.00051131, gnorm=0.288, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=14920 epoch 010: 124 / 1689 loss=4.34, nll_loss=2.736, ppl=6.66, wps=455228, ups=1.05, wpb=434222, bsz=16441, num_updates=15300, lr=0.00051131, gnorm=0.288, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=14920 epoch 010: 124 / 1689 loss=4.34, nll_loss=2.736, ppl=6.66, wps=455228, ups=1.05, wpb=434222, bsz=16441, num_updates=15300, lr=0.00051131, gnorm=0.288, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=14920 epoch 010: 124 / 1689 loss=4.34, nll_loss=2.736, ppl=6.66, wps=455228, ups=1.05, wpb=434222, bsz=16441, num_updates=15300, lr=0.00051131, gnorm=0.288, clip=0, loss_scale=2, train_wall=94, gb_free=20, wall=14920 epoch 010: 224 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=460835, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.292, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=15014 epoch 010: 224 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=460835, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.292, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=15014 epoch 010: 224 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=460835, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.292, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=15014 epoch 010: 224 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=460835, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.292, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=15014 epoch 010: 224 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=460835, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.292, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=15014 epoch 010: 224 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=460835, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.292, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=15014 epoch 010: 224 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=460835, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.292, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=15014 epoch 010: 224 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=460835, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.292, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=15014 epoch 010: 224 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=460835, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.292, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=15014 epoch 010: 224 / 1689 loss=4.331, nll_loss=2.727, ppl=6.62, wps=460835, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.292, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=15014 epoch 010: 325 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=460410, ups=1.06, wpb=433990, bsz=16487.9, num_updates=15500, lr=0.000508001, gnorm=0.312, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=15108 epoch 010: 325 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=460410, ups=1.06, wpb=433990, bsz=16487.9, num_updates=15500, lr=0.000508001, gnorm=0.312, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=15108 epoch 010: 325 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=460410, ups=1.06, wpb=433990, bsz=16487.9, num_updates=15500, lr=0.000508001, gnorm=0.312, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=15108 epoch 010: 325 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=460410, ups=1.06, wpb=433990, bsz=16487.9, num_updates=15500, lr=0.000508001, gnorm=0.312, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=15108 epoch 010: 325 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=460410, ups=1.06, wpb=433990, bsz=16487.9, num_updates=15500, lr=0.000508001, gnorm=0.312, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=15108 epoch 010: 325 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=460410, ups=1.06, wpb=433990, bsz=16487.9, num_updates=15500, lr=0.000508001, gnorm=0.312, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=15108 epoch 010: 325 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=460410, ups=1.06, wpb=433990, bsz=16487.9, num_updates=15500, lr=0.000508001, gnorm=0.312, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=15108 epoch 010: 325 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=460410, ups=1.06, wpb=433990, bsz=16487.9, num_updates=15500, lr=0.000508001, gnorm=0.312, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=15108 epoch 010: 325 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=460410, ups=1.06, wpb=433990, bsz=16487.9, num_updates=15500, lr=0.000508001, gnorm=0.312, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=15108 epoch 010: 325 / 1689 loss=4.338, nll_loss=2.735, ppl=6.66, wps=460410, ups=1.06, wpb=433990, bsz=16487.9, num_updates=15500, lr=0.000508001, gnorm=0.312, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=15108 epoch 010: 425 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=464211, ups=1.07, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15202 epoch 010: 425 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=464211, ups=1.07, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15202 epoch 010: 425 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=464211, ups=1.07, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15202 epoch 010: 425 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=464211, ups=1.07, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15202 epoch 010: 425 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=464211, ups=1.07, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15202 epoch 010: 425 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=464211, ups=1.07, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15202 epoch 010: 425 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=464211, ups=1.07, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15202 epoch 010: 425 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=464211, ups=1.07, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15202 epoch 010: 425 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=464211, ups=1.07, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15202 epoch 010: 425 / 1689 loss=4.335, nll_loss=2.732, ppl=6.64, wps=464211, ups=1.07, wpb=434605, bsz=16629.3, num_updates=15600, lr=0.00050637, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15202 epoch 010: 525 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462149, ups=1.07, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15296 epoch 010: 525 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462149, ups=1.07, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15296 epoch 010: 525 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462149, ups=1.07, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15296 epoch 010: 525 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462149, ups=1.07, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15296 epoch 010: 525 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462149, ups=1.07, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15296 epoch 010: 525 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462149, ups=1.07, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15296 epoch 010: 525 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462149, ups=1.07, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15296 epoch 010: 525 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462149, ups=1.07, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15296 epoch 010: 525 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462149, ups=1.07, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15296 epoch 010: 525 / 1689 loss=4.337, nll_loss=2.734, ppl=6.65, wps=462149, ups=1.07, wpb=432091, bsz=16260.2, num_updates=15700, lr=0.000504754, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15296 epoch 010: 625 / 1689 loss=4.34, nll_loss=2.738, ppl=6.67, wps=462829, ups=1.07, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.31, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15389 epoch 010: 625 / 1689 loss=4.34, nll_loss=2.738, ppl=6.67, wps=462829, ups=1.07, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.31, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15389 epoch 010: 625 / 1689 loss=4.34, nll_loss=2.738, ppl=6.67, wps=462829, ups=1.07, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.31, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15389 epoch 010: 625 / 1689 loss=4.34, nll_loss=2.738, ppl=6.67, wps=462829, ups=1.07, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.31, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15389 epoch 010: 625 / 1689 loss=4.34, nll_loss=2.738, ppl=6.67, wps=462829, ups=1.07, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.31, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15389 epoch 010: 625 / 1689 loss=4.34, nll_loss=2.738, ppl=6.67, wps=462829, ups=1.07, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.31, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15389 epoch 010: 625 / 1689 loss=4.34, nll_loss=2.738, ppl=6.67, wps=462829, ups=1.07, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.31, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15389 epoch 010: 625 / 1689 loss=4.34, nll_loss=2.738, ppl=6.67, wps=462829, ups=1.07, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.31, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15389 epoch 010: 625 / 1689 loss=4.34, nll_loss=2.738, ppl=6.67, wps=462829, ups=1.07, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.31, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15389 epoch 010: 625 / 1689 loss=4.34, nll_loss=2.738, ppl=6.67, wps=462829, ups=1.07, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.31, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=15389 epoch 010: 725 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=462548, ups=1.07, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15483 epoch 010: 725 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=462548, ups=1.07, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15483 epoch 010: 725 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=462548, ups=1.07, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15483 epoch 010: 725 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=462548, ups=1.07, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15483 epoch 010: 725 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=462548, ups=1.07, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15483 epoch 010: 725 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=462548, ups=1.07, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15483 epoch 010: 725 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=462548, ups=1.07, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15483 epoch 010: 725 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=462548, ups=1.07, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15483 epoch 010: 725 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=462548, ups=1.07, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15483 epoch 010: 725 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=462548, ups=1.07, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.292, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15483 epoch 010: 826 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=460394, ups=1.06, wpb=435303, bsz=16354.6, num_updates=16000, lr=0.0005, gnorm=0.275, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=15577 epoch 010: 826 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=460394, ups=1.06, wpb=435303, bsz=16354.6, num_updates=16000, lr=0.0005, gnorm=0.275, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=15577 epoch 010: 826 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=460394, ups=1.06, wpb=435303, bsz=16354.6, num_updates=16000, lr=0.0005, gnorm=0.275, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=15577 epoch 010: 826 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=460394, ups=1.06, wpb=435303, bsz=16354.6, num_updates=16000, lr=0.0005, gnorm=0.275, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=15577 epoch 010: 826 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=460394, ups=1.06, wpb=435303, bsz=16354.6, num_updates=16000, lr=0.0005, gnorm=0.275, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=15577 epoch 010: 826 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=460394, ups=1.06, wpb=435303, bsz=16354.6, num_updates=16000, lr=0.0005, gnorm=0.275, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=15577 epoch 010: 826 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=460394, ups=1.06, wpb=435303, bsz=16354.6, num_updates=16000, lr=0.0005, gnorm=0.275, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=15577 epoch 010: 826 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=460394, ups=1.06, wpb=435303, bsz=16354.6, num_updates=16000, lr=0.0005, gnorm=0.275, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=15577 epoch 010: 826 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=460394, ups=1.06, wpb=435303, bsz=16354.6, num_updates=16000, lr=0.0005, gnorm=0.275, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=15577 epoch 010: 826 / 1689 loss=4.336, nll_loss=2.733, ppl=6.65, wps=460394, ups=1.06, wpb=435303, bsz=16354.6, num_updates=16000, lr=0.0005, gnorm=0.275, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=15577 begin validation on "valid" subset epoch 010 | valid on 'valid' subset | loss 4.335 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.326 epoch 010 | valid on 'valid' subset | loss 4.335 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.326 epoch 010 | valid on 'valid' subset | loss 4.335 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.326 epoch 010 | valid on 'valid' subset | loss 4.335 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.326 epoch 010 | valid on 'valid' subset | loss 4.335 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.326 epoch 010 | valid on 'valid' subset | loss 4.335 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.326 epoch 010 | valid on 'valid' subset | loss 4.335 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.326 epoch 010 | valid on 'valid' subset | loss 4.335 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.326 epoch 010 | valid on 'valid' subset | loss 4.335 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.326 epoch 010 | valid on 'valid' subset | loss 4.335 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.326 epoch 010: 926 / 1689 loss=4.348, nll_loss=2.747, ppl=6.72, wps=366372, ups=0.84, wpb=436032, bsz=16771.8, num_updates=16100, lr=0.000498445, gnorm=0.295, clip=0, loss_scale=1, train_wall=103, gb_free=18.9, wall=15696 epoch 010: 926 / 1689 loss=4.348, nll_loss=2.747, ppl=6.72, wps=366372, ups=0.84, wpb=436032, bsz=16771.8, num_updates=16100, lr=0.000498445, gnorm=0.295, clip=0, loss_scale=1, train_wall=103, gb_free=18.9, wall=15696 epoch 010: 926 / 1689 loss=4.348, nll_loss=2.747, ppl=6.72, wps=366372, ups=0.84, wpb=436032, bsz=16771.8, num_updates=16100, lr=0.000498445, gnorm=0.295, clip=0, loss_scale=1, train_wall=103, gb_free=18.9, wall=15696 epoch 010: 926 / 1689 loss=4.348, nll_loss=2.747, ppl=6.72, wps=366372, ups=0.84, wpb=436032, bsz=16771.8, num_updates=16100, lr=0.000498445, gnorm=0.295, clip=0, loss_scale=1, train_wall=103, gb_free=18.9, wall=15696 epoch 010: 926 / 1689 loss=4.348, nll_loss=2.747, ppl=6.72, wps=366372, ups=0.84, wpb=436032, bsz=16771.8, num_updates=16100, lr=0.000498445, gnorm=0.295, clip=0, loss_scale=1, train_wall=103, gb_free=18.9, wall=15696 epoch 010: 926 / 1689 loss=4.348, nll_loss=2.747, ppl=6.72, wps=366372, ups=0.84, wpb=436032, bsz=16771.8, num_updates=16100, lr=0.000498445, gnorm=0.295, clip=0, loss_scale=1, train_wall=103, gb_free=18.9, wall=15696 epoch 010: 926 / 1689 loss=4.348, nll_loss=2.747, ppl=6.72, wps=366372, ups=0.84, wpb=436032, bsz=16771.8, num_updates=16100, lr=0.000498445, gnorm=0.295, clip=0, loss_scale=1, train_wall=103, gb_free=18.9, wall=15696 epoch 010: 926 / 1689 loss=4.348, nll_loss=2.747, ppl=6.72, wps=366372, ups=0.84, wpb=436032, bsz=16771.8, num_updates=16100, lr=0.000498445, gnorm=0.295, clip=0, loss_scale=1, train_wall=103, gb_free=18.9, wall=15696 epoch 010: 926 / 1689 loss=4.348, nll_loss=2.747, ppl=6.72, wps=366372, ups=0.84, wpb=436032, bsz=16771.8, num_updates=16100, lr=0.000498445, gnorm=0.295, clip=0, loss_scale=1, train_wall=103, gb_free=18.9, wall=15696 epoch 010: 926 / 1689 loss=4.348, nll_loss=2.747, ppl=6.72, wps=366372, ups=0.84, wpb=436032, bsz=16771.8, num_updates=16100, lr=0.000498445, gnorm=0.295, clip=0, loss_scale=1, train_wall=103, gb_free=18.9, wall=15696 epoch 010: 1026 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=466433, ups=1.08, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.296, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=15789 epoch 010: 1026 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=466433, ups=1.08, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.296, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=15789 epoch 010: 1026 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=466433, ups=1.08, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.296, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=15789 epoch 010: 1026 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=466433, ups=1.08, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.296, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=15789 epoch 010: 1026 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=466433, ups=1.08, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.296, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=15789 epoch 010: 1026 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=466433, ups=1.08, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.296, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=15789 epoch 010: 1026 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=466433, ups=1.08, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.296, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=15789 epoch 010: 1026 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=466433, ups=1.08, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.296, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=15789 epoch 010: 1026 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=466433, ups=1.08, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.296, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=15789 epoch 010: 1026 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=466433, ups=1.08, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.296, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=15789 epoch 010: 1126 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=460990, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15883 epoch 010: 1126 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=460990, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15883 epoch 010: 1126 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=460990, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15883 epoch 010: 1126 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=460990, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15883 epoch 010: 1126 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=460990, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15883 epoch 010: 1126 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=460990, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15883 epoch 010: 1126 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=460990, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15883 epoch 010: 1126 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=460990, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15883 epoch 010: 1126 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=460990, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15883 epoch 010: 1126 / 1689 loss=4.335, nll_loss=2.733, ppl=6.65, wps=460990, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=15883 epoch 010: 1226 / 1689 loss=4.343, nll_loss=2.741, ppl=6.69, wps=461487, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.292, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=15977 epoch 010: 1226 / 1689 loss=4.343, nll_loss=2.741, ppl=6.69, wps=461487, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.292, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=15977 epoch 010: 1226 / 1689 loss=4.343, nll_loss=2.741, ppl=6.69, wps=461487, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.292, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=15977 epoch 010: 1226 / 1689 loss=4.343, nll_loss=2.741, ppl=6.69, wps=461487, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.292, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=15977 epoch 010: 1226 / 1689 loss=4.343, nll_loss=2.741, ppl=6.69, wps=461487, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.292, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=15977 epoch 010: 1226 / 1689 loss=4.343, nll_loss=2.741, ppl=6.69, wps=461487, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.292, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=15977 epoch 010: 1226 / 1689 loss=4.343, nll_loss=2.741, ppl=6.69, wps=461487, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.292, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=15977 epoch 010: 1226 / 1689 loss=4.343, nll_loss=2.741, ppl=6.69, wps=461487, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.292, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=15977 epoch 010: 1226 / 1689 loss=4.343, nll_loss=2.741, ppl=6.69, wps=461487, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.292, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=15977 epoch 010: 1226 / 1689 loss=4.343, nll_loss=2.741, ppl=6.69, wps=461487, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.292, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=15977 epoch 010: 1326 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461811, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=16071 epoch 010: 1326 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461811, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=16071 epoch 010: 1326 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461811, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=16071 epoch 010: 1326 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461811, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=16071 epoch 010: 1326 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461811, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=16071 epoch 010: 1326 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461811, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=16071 epoch 010: 1326 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461811, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=16071 epoch 010: 1326 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461811, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=16071 epoch 010: 1326 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461811, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=16071 epoch 010: 1326 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461811, ups=1.06, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.302, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=16071 epoch 010: 1426 / 1689 loss=4.355, nll_loss=2.755, ppl=6.75, wps=466808, ups=1.08, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.295, clip=0, loss_scale=2, train_wall=91, gb_free=20.5, wall=16164 epoch 010: 1426 / 1689 loss=4.355, nll_loss=2.755, ppl=6.75, wps=466808, ups=1.08, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.295, clip=0, loss_scale=2, train_wall=91, gb_free=20.5, wall=16164 epoch 010: 1426 / 1689 loss=4.355, nll_loss=2.755, ppl=6.75, wps=466808, ups=1.08, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.295, clip=0, loss_scale=2, train_wall=91, gb_free=20.5, wall=16164 epoch 010: 1426 / 1689 loss=4.355, nll_loss=2.755, ppl=6.75, wps=466808, ups=1.08, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.295, clip=0, loss_scale=2, train_wall=91, gb_free=20.5, wall=16164 epoch 010: 1426 / 1689 loss=4.355, nll_loss=2.755, ppl=6.75, wps=466808, ups=1.08, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.295, clip=0, loss_scale=2, train_wall=91, gb_free=20.5, wall=16164 epoch 010: 1426 / 1689 loss=4.355, nll_loss=2.755, ppl=6.75, wps=466808, ups=1.08, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.295, clip=0, loss_scale=2, train_wall=91, gb_free=20.5, wall=16164 epoch 010: 1426 / 1689 loss=4.355, nll_loss=2.755, ppl=6.75, wps=466808, ups=1.08, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.295, clip=0, loss_scale=2, train_wall=91, gb_free=20.5, wall=16164 epoch 010: 1426 / 1689 loss=4.355, nll_loss=2.755, ppl=6.75, wps=466808, ups=1.08, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.295, clip=0, loss_scale=2, train_wall=91, gb_free=20.5, wall=16164 epoch 010: 1426 / 1689 loss=4.355, nll_loss=2.755, ppl=6.75, wps=466808, ups=1.08, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.295, clip=0, loss_scale=2, train_wall=91, gb_free=20.5, wall=16164 epoch 010: 1426 / 1689 loss=4.355, nll_loss=2.755, ppl=6.75, wps=466808, ups=1.08, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.295, clip=0, loss_scale=2, train_wall=91, gb_free=20.5, wall=16164 epoch 010: 1527 / 1689 loss=4.344, nll_loss=2.743, ppl=6.69, wps=458642, ups=1.06, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=16258 epoch 010: 1527 / 1689 loss=4.344, nll_loss=2.743, ppl=6.69, wps=458642, ups=1.06, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=16258 epoch 010: 1527 / 1689 loss=4.344, nll_loss=2.743, ppl=6.69, wps=458642, ups=1.06, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=16258 epoch 010: 1527 / 1689 loss=4.344, nll_loss=2.743, ppl=6.69, wps=458642, ups=1.06, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=16258 epoch 010: 1527 / 1689 loss=4.344, nll_loss=2.743, ppl=6.69, wps=458642, ups=1.06, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=16258 epoch 010: 1527 / 1689 loss=4.344, nll_loss=2.743, ppl=6.69, wps=458642, ups=1.06, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=16258 epoch 010: 1527 / 1689 loss=4.344, nll_loss=2.743, ppl=6.69, wps=458642, ups=1.06, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=16258 epoch 010: 1527 / 1689 loss=4.344, nll_loss=2.743, ppl=6.69, wps=458642, ups=1.06, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=16258 epoch 010: 1527 / 1689 loss=4.344, nll_loss=2.743, ppl=6.69, wps=458642, ups=1.06, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=16258 epoch 010: 1527 / 1689 loss=4.344, nll_loss=2.743, ppl=6.69, wps=458642, ups=1.06, wpb=432743, bsz=16236.9, num_updates=16700, lr=0.000489409, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=16258 epoch 010: 1627 / 1689 loss=4.349, nll_loss=2.748, ppl=6.72, wps=462286, ups=1.06, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=16352 epoch 010: 1627 / 1689 loss=4.349, nll_loss=2.748, ppl=6.72, wps=462286, ups=1.06, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=16352 epoch 010: 1627 / 1689 loss=4.349, nll_loss=2.748, ppl=6.72, wps=462286, ups=1.06, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=16352 epoch 010: 1627 / 1689 loss=4.349, nll_loss=2.748, ppl=6.72, wps=462286, ups=1.06, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=16352 epoch 010: 1627 / 1689 loss=4.349, nll_loss=2.748, ppl=6.72, wps=462286, ups=1.06, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=16352 epoch 010: 1627 / 1689 loss=4.349, nll_loss=2.748, ppl=6.72, wps=462286, ups=1.06, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=16352 epoch 010: 1627 / 1689 loss=4.349, nll_loss=2.748, ppl=6.72, wps=462286, ups=1.06, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=16352 epoch 010: 1627 / 1689 loss=4.349, nll_loss=2.748, ppl=6.72, wps=462286, ups=1.06, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=16352 epoch 010: 1627 / 1689 loss=4.349, nll_loss=2.748, ppl=6.72, wps=462286, ups=1.06, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=16352 epoch 010: 1627 / 1689 loss=4.349, nll_loss=2.748, ppl=6.72, wps=462286, ups=1.06, wpb=434982, bsz=16340.3, num_updates=16800, lr=0.00048795, gnorm=0.308, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=16352 end of epoch 10 (average epoch stats below) epoch 010 | loss 4.34 | nll_loss 2.738 | ppl 6.67 | wps 454704 | ups 1.05 | wpb 433536 | bsz 16501.4 | num_updates 16862 | lr 0.000487052 | gnorm 0.297 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21.2 | wall 16409 epoch 010 | loss 4.34 | nll_loss 2.738 | ppl 6.67 | wps 454704 | ups 1.05 | wpb 433536 | bsz 16501.4 | num_updates 16862 | lr 0.000487052 | gnorm 0.297 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21.2 | wall 16409 epoch 010 | loss 4.34 | nll_loss 2.738 | ppl 6.67 | wps 454704 | ups 1.05 | wpb 433536 | bsz 16501.4 | num_updates 16862 | lr 0.000487052 | gnorm 0.297 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21.2 | wall 16409 epoch 010 | loss 4.34 | nll_loss 2.738 | ppl 6.67 | wps 454704 | ups 1.05 | wpb 433536 | bsz 16501.4 | num_updates 16862 | lr 0.000487052 | gnorm 0.297 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21.2 | wall 16409 epoch 010 | loss 4.34 | nll_loss 2.738 | ppl 6.67 | wps 454704 | ups 1.05 | wpb 433536 | bsz 16501.4 | num_updates 16862 | lr 0.000487052 | gnorm 0.297 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21.2 | wall 16409 epoch 010 | loss 4.34 | nll_loss 2.738 | ppl 6.67 | wps 454704 | ups 1.05 | wpb 433536 | bsz 16501.4 | num_updates 16862 | lr 0.000487052 | gnorm 0.297 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21.2 | wall 16409 epoch 010 | loss 4.34 | nll_loss 2.738 | ppl 6.67 | wps 454704 | ups 1.05 | wpb 433536 | bsz 16501.4 | num_updates 16862 | lr 0.000487052 | gnorm 0.297 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21.2 | wall 16409 epoch 010 | loss 4.34 | nll_loss 2.738 | ppl 6.67 | wps 454704 | ups 1.05 | wpb 433536 | bsz 16501.4 | num_updates 16862 | lr 0.000487052 | gnorm 0.297 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21.2 | wall 16409 epoch 010 | loss 4.34 | nll_loss 2.738 | ppl 6.67 | wps 454704 | ups 1.05 | wpb 433536 | bsz 16501.4 | num_updates 16862 | lr 0.000487052 | gnorm 0.297 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21.2 | wall 16409 epoch 010 | loss 4.34 | nll_loss 2.738 | ppl 6.67 | wps 454704 | ups 1.05 | wpb 433536 | bsz 16501.4 | num_updates 16862 | lr 0.000487052 | gnorm 0.297 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21.2 | wall 16409 Start iterating over samples epoch 011: 38 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461137, ups=1.07, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.288, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=16446 epoch 011: 38 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461137, ups=1.07, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.288, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=16446 epoch 011: 38 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461137, ups=1.07, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.288, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=16446 epoch 011: 38 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461137, ups=1.07, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.288, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=16446 epoch 011: 38 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461137, ups=1.07, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.288, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=16446 epoch 011: 38 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461137, ups=1.07, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.288, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=16446 epoch 011: 38 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461137, ups=1.07, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.288, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=16446 epoch 011: 38 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461137, ups=1.07, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.288, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=16446 epoch 011: 38 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461137, ups=1.07, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.288, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=16446 epoch 011: 38 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461137, ups=1.07, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.288, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=16446 epoch 011: 38 / 1689 loss=4.332, nll_loss=2.729, ppl=6.63, wps=461137, ups=1.07, wpb=432094, bsz=15973.4, num_updates=16900, lr=0.000486504, gnorm=0.288, clip=0, loss_scale=1, train_wall=91, gb_free=20, wall=16446 epoch 011: 138 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=463879, ups=1.07, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.287, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=16539 epoch 011: 138 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=463879, ups=1.07, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.287, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=16539 epoch 011: 138 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=463879, ups=1.07, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.287, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=16539 epoch 011: 138 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=463879, ups=1.07, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.287, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=16539 epoch 011: 138 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=463879, ups=1.07, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.287, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=16539 epoch 011: 138 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=463879, ups=1.07, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.287, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=16539 epoch 011: 138 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=463879, ups=1.07, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.287, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=16539 epoch 011: 138 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=463879, ups=1.07, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.287, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=16539 epoch 011: 138 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=463879, ups=1.07, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.287, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=16539 epoch 011: 138 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=463879, ups=1.07, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.287, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=16539 epoch 011: 138 / 1689 loss=4.305, nll_loss=2.699, ppl=6.49, wps=463879, ups=1.07, wpb=434386, bsz=16788.3, num_updates=17000, lr=0.000485071, gnorm=0.287, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=16539 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 4.325 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.325 epoch 011 | valid on 'valid' subset | loss 4.325 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.325 epoch 011 | valid on 'valid' subset | loss 4.325 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.325 epoch 011 | valid on 'valid' subset | loss 4.325 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.325 epoch 011 | valid on 'valid' subset | loss 4.325 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.325 epoch 011 | valid on 'valid' subset | loss 4.325 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.325 epoch 011 | valid on 'valid' subset | loss 4.325 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.325 epoch 011 | valid on 'valid' subset | loss 4.325 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.325 epoch 011 | valid on 'valid' subset | loss 4.325 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.325 epoch 011 | valid on 'valid' subset | loss 4.325 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.325 epoch 011 | valid on 'valid' subset | loss 4.325 | nll_loss 2.671 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.325 epoch 011: 238 / 1689 loss=4.309, nll_loss=2.703, ppl=6.51, wps=379982, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.287, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=16653 epoch 011: 238 / 1689 loss=4.309, nll_loss=2.703, ppl=6.51, wps=379982, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.287, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=16653 epoch 011: 238 / 1689 loss=4.309, nll_loss=2.703, ppl=6.51, wps=379982, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.287, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=16653 epoch 011: 238 / 1689 loss=4.309, nll_loss=2.703, ppl=6.51, wps=379982, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.287, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=16653 epoch 011: 238 / 1689 loss=4.309, nll_loss=2.703, ppl=6.51, wps=379982, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.287, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=16653 epoch 011: 238 / 1689 loss=4.309, nll_loss=2.703, ppl=6.51, wps=379982, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.287, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=16653 epoch 011: 238 / 1689 loss=4.309, nll_loss=2.703, ppl=6.51, wps=379982, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.287, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=16653 epoch 011: 238 / 1689 loss=4.309, nll_loss=2.703, ppl=6.51, wps=379982, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.287, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=16653 epoch 011: 238 / 1689 loss=4.309, nll_loss=2.703, ppl=6.51, wps=379982, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.287, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=16653 epoch 011: 238 / 1689 loss=4.309, nll_loss=2.703, ppl=6.51, wps=379982, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.287, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=16653 epoch 011: 238 / 1689 loss=4.309, nll_loss=2.703, ppl=6.51, wps=379982, ups=0.88, wpb=431038, bsz=16513.8, num_updates=17100, lr=0.000483651, gnorm=0.287, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=16653 epoch 011: 338 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=468474, ups=1.08, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.285, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=16745 epoch 011: 338 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=468474, ups=1.08, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.285, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=16745 epoch 011: 338 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=468474, ups=1.08, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.285, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=16745 epoch 011: 338 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=468474, ups=1.08, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.285, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=16745 epoch 011: 338 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=468474, ups=1.08, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.285, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=16745 epoch 011: 338 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=468474, ups=1.08, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.285, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=16745 epoch 011: 338 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=468474, ups=1.08, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.285, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=16745 epoch 011: 338 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=468474, ups=1.08, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.285, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=16745 epoch 011: 338 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=468474, ups=1.08, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.285, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=16745 epoch 011: 338 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=468474, ups=1.08, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.285, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=16745 epoch 011: 338 / 1689 loss=4.313, nll_loss=2.708, ppl=6.53, wps=468474, ups=1.08, wpb=433201, bsz=16930.6, num_updates=17200, lr=0.000482243, gnorm=0.285, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=16745 epoch 011: 438 / 1689 loss=4.323, nll_loss=2.719, ppl=6.59, wps=468099, ups=1.08, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.3, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=16838 epoch 011: 438 / 1689 loss=4.323, nll_loss=2.719, ppl=6.59, wps=468099, ups=1.08, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.3, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=16838 epoch 011: 438 / 1689 loss=4.323, nll_loss=2.719, ppl=6.59, wps=468099, ups=1.08, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.3, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=16838 epoch 011: 438 / 1689 loss=4.323, nll_loss=2.719, ppl=6.59, wps=468099, ups=1.08, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.3, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=16838 epoch 011: 438 / 1689 loss=4.323, nll_loss=2.719, ppl=6.59, wps=468099, ups=1.08, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.3, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=16838 epoch 011: 438 / 1689 loss=4.323, nll_loss=2.719, ppl=6.59, wps=468099, ups=1.08, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.3, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=16838 epoch 011: 438 / 1689 loss=4.323, nll_loss=2.719, ppl=6.59, wps=468099, ups=1.08, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.3, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=16838 epoch 011: 438 / 1689 loss=4.323, nll_loss=2.719, ppl=6.59, wps=468099, ups=1.08, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.3, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=16838 epoch 011: 438 / 1689 loss=4.323, nll_loss=2.719, ppl=6.59, wps=468099, ups=1.08, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.3, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=16838 epoch 011: 438 / 1689 loss=4.323, nll_loss=2.719, ppl=6.59, wps=468099, ups=1.08, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.3, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=16838 epoch 011: 438 / 1689 loss=4.323, nll_loss=2.719, ppl=6.59, wps=468099, ups=1.08, wpb=434781, bsz=16284.2, num_updates=17300, lr=0.000480847, gnorm=0.3, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=16838 epoch 011: 538 / 1689 loss=4.324, nll_loss=2.721, ppl=6.59, wps=469029, ups=1.08, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=16931 epoch 011: 538 / 1689 loss=4.324, nll_loss=2.721, ppl=6.59, wps=469029, ups=1.08, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=16931 epoch 011: 538 / 1689 loss=4.324, nll_loss=2.721, ppl=6.59, wps=469029, ups=1.08, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=16931 epoch 011: 538 / 1689 loss=4.324, nll_loss=2.721, ppl=6.59, wps=469029, ups=1.08, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=16931 epoch 011: 538 / 1689 loss=4.324, nll_loss=2.721, ppl=6.59, wps=469029, ups=1.08, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=16931 epoch 011: 538 / 1689 loss=4.324, nll_loss=2.721, ppl=6.59, wps=469029, ups=1.08, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=16931 epoch 011: 538 / 1689 loss=4.324, nll_loss=2.721, ppl=6.59, wps=469029, ups=1.08, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=16931 epoch 011: 538 / 1689 loss=4.324, nll_loss=2.721, ppl=6.59, wps=469029, ups=1.08, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=16931 epoch 011: 538 / 1689 loss=4.324, nll_loss=2.721, ppl=6.59, wps=469029, ups=1.08, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=16931 epoch 011: 538 / 1689 loss=4.324, nll_loss=2.721, ppl=6.59, wps=469029, ups=1.08, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=16931 epoch 011: 538 / 1689 loss=4.324, nll_loss=2.721, ppl=6.59, wps=469029, ups=1.08, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.298, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=16931 epoch 011: 638 / 1689 loss=4.321, nll_loss=2.717, ppl=6.58, wps=465961, ups=1.07, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.285, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=17025 epoch 011: 638 / 1689 loss=4.321, nll_loss=2.717, ppl=6.58, wps=465961, ups=1.07, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.285, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=17025 epoch 011: 638 / 1689 loss=4.321, nll_loss=2.717, ppl=6.58, wps=465961, ups=1.07, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.285, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=17025 epoch 011: 638 / 1689 loss=4.321, nll_loss=2.717, ppl=6.58, wps=465961, ups=1.07, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.285, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=17025 epoch 011: 638 / 1689 loss=4.321, nll_loss=2.717, ppl=6.58, wps=465961, ups=1.07, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.285, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=17025 epoch 011: 638 / 1689 loss=4.321, nll_loss=2.717, ppl=6.58, wps=465961, ups=1.07, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.285, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=17025 epoch 011: 638 / 1689 loss=4.321, nll_loss=2.717, ppl=6.58, wps=465961, ups=1.07, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.285, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=17025 epoch 011: 638 / 1689 loss=4.321, nll_loss=2.717, ppl=6.58, wps=465961, ups=1.07, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.285, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=17025 epoch 011: 638 / 1689 loss=4.321, nll_loss=2.717, ppl=6.58, wps=465961, ups=1.07, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.285, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=17025 epoch 011: 638 / 1689 loss=4.321, nll_loss=2.717, ppl=6.58, wps=465961, ups=1.07, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.285, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=17025 epoch 011: 638 / 1689 loss=4.321, nll_loss=2.717, ppl=6.58, wps=465961, ups=1.07, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.285, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=17025 epoch 011: 738 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=460182, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=17119 epoch 011: 738 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=460182, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=17119 epoch 011: 738 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=460182, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=17119 epoch 011: 738 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=460182, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=17119 epoch 011: 738 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=460182, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=17119 epoch 011: 738 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=460182, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=17119 epoch 011: 738 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=460182, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=17119 epoch 011: 738 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=460182, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=17119 epoch 011: 738 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=460182, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=17119 epoch 011: 738 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=460182, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=17119 epoch 011: 738 / 1689 loss=4.329, nll_loss=2.726, ppl=6.62, wps=460182, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.284, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=17119 epoch 011: 838 / 1689 loss=4.317, nll_loss=2.712, ppl=6.55, wps=464536, ups=1.07, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.276, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=17213 epoch 011: 838 / 1689 loss=4.317, nll_loss=2.712, ppl=6.55, wps=464536, ups=1.07, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.276, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=17213 epoch 011: 838 / 1689 loss=4.317, nll_loss=2.712, ppl=6.55, wps=464536, ups=1.07, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.276, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=17213 epoch 011: 838 / 1689 loss=4.317, nll_loss=2.712, ppl=6.55, wps=464536, ups=1.07, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.276, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=17213 epoch 011: 838 / 1689 loss=4.317, nll_loss=2.712, ppl=6.55, wps=464536, ups=1.07, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.276, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=17213 epoch 011: 838 / 1689 loss=4.317, nll_loss=2.712, ppl=6.55, wps=464536, ups=1.07, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.276, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=17213 epoch 011: 838 / 1689 loss=4.317, nll_loss=2.712, ppl=6.55, wps=464536, ups=1.07, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.276, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=17213 epoch 011: 838 / 1689 loss=4.317, nll_loss=2.712, ppl=6.55, wps=464536, ups=1.07, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.276, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=17213 epoch 011: 838 / 1689 loss=4.317, nll_loss=2.712, ppl=6.55, wps=464536, ups=1.07, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.276, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=17213 epoch 011: 838 / 1689 loss=4.317, nll_loss=2.712, ppl=6.55, wps=464536, ups=1.07, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.276, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=17213 epoch 011: 838 / 1689 loss=4.317, nll_loss=2.712, ppl=6.55, wps=464536, ups=1.07, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.276, clip=0, loss_scale=2, train_wall=92, gb_free=17.4, wall=17213 epoch 011: 940 / 1689 loss=4.315, nll_loss=2.71, ppl=6.55, wps=451987, ups=1.05, wpb=431048, bsz=16433.4, num_updates=17800, lr=0.000474045, gnorm=0.292, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=17308 epoch 011: 940 / 1689 loss=4.315, nll_loss=2.71, ppl=6.55, wps=451987, ups=1.05, wpb=431048, bsz=16433.4, num_updates=17800, lr=0.000474045, gnorm=0.292, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=17308 epoch 011: 940 / 1689 loss=4.315, nll_loss=2.71, ppl=6.55, wps=451987, ups=1.05, wpb=431048, bsz=16433.4, num_updates=17800, lr=0.000474045, gnorm=0.292, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=17308 epoch 011: 940 / 1689 loss=4.315, nll_loss=2.71, ppl=6.55, wps=451987, ups=1.05, wpb=431048, bsz=16433.4, num_updates=17800, lr=0.000474045, gnorm=0.292, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=17308 epoch 011: 940 / 1689 loss=4.315, nll_loss=2.71, ppl=6.55, wps=451987, ups=1.05, wpb=431048, bsz=16433.4, num_updates=17800, lr=0.000474045, gnorm=0.292, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=17308 epoch 011: 940 / 1689 loss=4.315, nll_loss=2.71, ppl=6.55, wps=451987, ups=1.05, wpb=431048, bsz=16433.4, num_updates=17800, lr=0.000474045, gnorm=0.292, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=17308 epoch 011: 940 / 1689 loss=4.315, nll_loss=2.71, ppl=6.55, wps=451987, ups=1.05, wpb=431048, bsz=16433.4, num_updates=17800, lr=0.000474045, gnorm=0.292, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=17308 epoch 011: 940 / 1689 loss=4.315, nll_loss=2.71, ppl=6.55, wps=451987, ups=1.05, wpb=431048, bsz=16433.4, num_updates=17800, lr=0.000474045, gnorm=0.292, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=17308 epoch 011: 940 / 1689 loss=4.315, nll_loss=2.71, ppl=6.55, wps=451987, ups=1.05, wpb=431048, bsz=16433.4, num_updates=17800, lr=0.000474045, gnorm=0.292, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=17308 epoch 011: 940 / 1689 loss=4.315, nll_loss=2.71, ppl=6.55, wps=451987, ups=1.05, wpb=431048, bsz=16433.4, num_updates=17800, lr=0.000474045, gnorm=0.292, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=17308 epoch 011: 940 / 1689 loss=4.315, nll_loss=2.71, ppl=6.55, wps=451987, ups=1.05, wpb=431048, bsz=16433.4, num_updates=17800, lr=0.000474045, gnorm=0.292, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=17308 epoch 011: 1040 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=461972, ups=1.06, wpb=434476, bsz=16732.8, num_updates=17900, lr=0.000472719, gnorm=0.289, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=17402 epoch 011: 1040 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=461972, ups=1.06, wpb=434476, bsz=16732.8, num_updates=17900, lr=0.000472719, gnorm=0.289, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=17402 epoch 011: 1040 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=461972, ups=1.06, wpb=434476, bsz=16732.8, num_updates=17900, lr=0.000472719, gnorm=0.289, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=17402 epoch 011: 1040 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=461972, ups=1.06, wpb=434476, bsz=16732.8, num_updates=17900, lr=0.000472719, gnorm=0.289, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=17402 epoch 011: 1040 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=461972, ups=1.06, wpb=434476, bsz=16732.8, num_updates=17900, lr=0.000472719, gnorm=0.289, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=17402 epoch 011: 1040 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=461972, ups=1.06, wpb=434476, bsz=16732.8, num_updates=17900, lr=0.000472719, gnorm=0.289, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=17402 epoch 011: 1040 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=461972, ups=1.06, wpb=434476, bsz=16732.8, num_updates=17900, lr=0.000472719, gnorm=0.289, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=17402 epoch 011: 1040 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=461972, ups=1.06, wpb=434476, bsz=16732.8, num_updates=17900, lr=0.000472719, gnorm=0.289, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=17402 epoch 011: 1040 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=461972, ups=1.06, wpb=434476, bsz=16732.8, num_updates=17900, lr=0.000472719, gnorm=0.289, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=17402 epoch 011: 1040 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=461972, ups=1.06, wpb=434476, bsz=16732.8, num_updates=17900, lr=0.000472719, gnorm=0.289, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=17402 epoch 011: 1040 / 1689 loss=4.333, nll_loss=2.731, ppl=6.64, wps=461972, ups=1.06, wpb=434476, bsz=16732.8, num_updates=17900, lr=0.000472719, gnorm=0.289, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=17402 epoch 011: 1140 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462316, ups=1.07, wpb=433618, bsz=16550.8, num_updates=18000, lr=0.000471405, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=17496 epoch 011: 1140 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462316, ups=1.07, wpb=433618, bsz=16550.8, num_updates=18000, lr=0.000471405, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=17496 epoch 011: 1140 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462316, ups=1.07, wpb=433618, bsz=16550.8, num_updates=18000, lr=0.000471405, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=17496 epoch 011: 1140 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462316, ups=1.07, wpb=433618, bsz=16550.8, num_updates=18000, lr=0.000471405, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=17496 epoch 011: 1140 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462316, ups=1.07, wpb=433618, bsz=16550.8, num_updates=18000, lr=0.000471405, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=17496 epoch 011: 1140 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462316, ups=1.07, wpb=433618, bsz=16550.8, num_updates=18000, lr=0.000471405, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=17496 epoch 011: 1140 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462316, ups=1.07, wpb=433618, bsz=16550.8, num_updates=18000, lr=0.000471405, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=17496 epoch 011: 1140 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462316, ups=1.07, wpb=433618, bsz=16550.8, num_updates=18000, lr=0.000471405, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=17496 epoch 011: 1140 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462316, ups=1.07, wpb=433618, bsz=16550.8, num_updates=18000, lr=0.000471405, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=17496 epoch 011: 1140 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462316, ups=1.07, wpb=433618, bsz=16550.8, num_updates=18000, lr=0.000471405, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=17496 epoch 011: 1140 / 1689 loss=4.317, nll_loss=2.713, ppl=6.56, wps=462316, ups=1.07, wpb=433618, bsz=16550.8, num_updates=18000, lr=0.000471405, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=17496 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 4.317 | nll_loss 2.675 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.317 epoch 011 | valid on 'valid' subset | loss 4.317 | nll_loss 2.675 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.317 epoch 011 | valid on 'valid' subset | loss 4.317 | nll_loss 2.675 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.317 epoch 011 | valid on 'valid' subset | loss 4.317 | nll_loss 2.675 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.317 epoch 011 | valid on 'valid' subset | loss 4.317 | nll_loss 2.675 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.317 epoch 011 | valid on 'valid' subset | loss 4.317 | nll_loss 2.675 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.317 epoch 011 | valid on 'valid' subset | loss 4.317 | nll_loss 2.675 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.317 epoch 011 | valid on 'valid' subset | loss 4.317 | nll_loss 2.675 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.317 epoch 011 | valid on 'valid' subset | loss 4.317 | nll_loss 2.675 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.317 epoch 011 | valid on 'valid' subset | loss 4.317 | nll_loss 2.675 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.317 epoch 011 | valid on 'valid' subset | loss 4.317 | nll_loss 2.675 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.317 epoch 011: 1240 / 1689 loss=4.31, nll_loss=2.705, ppl=6.52, wps=123979, ups=0.29, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=1, train_wall=202, gb_free=19.9, wall=17845 epoch 011: 1240 / 1689 loss=4.31, nll_loss=2.705, ppl=6.52, wps=123979, ups=0.29, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=1, train_wall=202, gb_free=19.9, wall=17845 epoch 011: 1240 / 1689 loss=4.31, nll_loss=2.705, ppl=6.52, wps=123979, ups=0.29, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=1, train_wall=202, gb_free=19.9, wall=17845 epoch 011: 1240 / 1689 loss=4.31, nll_loss=2.705, ppl=6.52, wps=123979, ups=0.29, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=1, train_wall=202, gb_free=19.9, wall=17845 epoch 011: 1240 / 1689 loss=4.31, nll_loss=2.705, ppl=6.52, wps=123979, ups=0.29, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=1, train_wall=202, gb_free=19.9, wall=17845 epoch 011: 1240 / 1689 loss=4.31, nll_loss=2.705, ppl=6.52, wps=123979, ups=0.29, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=1, train_wall=202, gb_free=19.9, wall=17845 epoch 011: 1240 / 1689 loss=4.31, nll_loss=2.705, ppl=6.52, wps=123979, ups=0.29, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=1, train_wall=202, gb_free=19.9, wall=17845 epoch 011: 1240 / 1689 loss=4.31, nll_loss=2.705, ppl=6.52, wps=123979, ups=0.29, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=1, train_wall=202, gb_free=19.9, wall=17845 epoch 011: 1240 / 1689 loss=4.31, nll_loss=2.705, ppl=6.52, wps=123979, ups=0.29, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=1, train_wall=202, gb_free=19.9, wall=17845 epoch 011: 1240 / 1689 loss=4.31, nll_loss=2.705, ppl=6.52, wps=123979, ups=0.29, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=1, train_wall=202, gb_free=19.9, wall=17845 epoch 011: 1240 / 1689 loss=4.31, nll_loss=2.705, ppl=6.52, wps=123979, ups=0.29, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=1, train_wall=202, gb_free=19.9, wall=17845 epoch 011: 1340 / 1689 loss=4.316, nll_loss=2.713, ppl=6.55, wps=470173, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.297, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=17937 epoch 011: 1340 / 1689 loss=4.316, nll_loss=2.713, ppl=6.55, wps=470173, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.297, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=17937 epoch 011: 1340 / 1689 loss=4.316, nll_loss=2.713, ppl=6.55, wps=470173, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.297, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=17937 epoch 011: 1340 / 1689 loss=4.316, nll_loss=2.713, ppl=6.55, wps=470173, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.297, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=17937 epoch 011: 1340 / 1689 loss=4.316, nll_loss=2.713, ppl=6.55, wps=470173, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.297, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=17937 epoch 011: 1340 / 1689 loss=4.316, nll_loss=2.713, ppl=6.55, wps=470173, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.297, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=17937 epoch 011: 1340 / 1689 loss=4.316, nll_loss=2.713, ppl=6.55, wps=470173, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.297, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=17937 epoch 011: 1340 / 1689 loss=4.316, nll_loss=2.713, ppl=6.55, wps=470173, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.297, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=17937 epoch 011: 1340 / 1689 loss=4.316, nll_loss=2.713, ppl=6.55, wps=470173, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.297, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=17937 epoch 011: 1340 / 1689 loss=4.316, nll_loss=2.713, ppl=6.55, wps=470173, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.297, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=17937 epoch 011: 1340 / 1689 loss=4.316, nll_loss=2.713, ppl=6.55, wps=470173, ups=1.08, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.297, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=17937 epoch 011: 1440 / 1689 loss=4.326, nll_loss=2.723, ppl=6.6, wps=463134, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.294, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=18030 epoch 011: 1440 / 1689 loss=4.326, nll_loss=2.723, ppl=6.6, wps=463134, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.294, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=18030 epoch 011: 1440 / 1689 loss=4.326, nll_loss=2.723, ppl=6.6, wps=463134, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.294, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=18030 epoch 011: 1440 / 1689 loss=4.326, nll_loss=2.723, ppl=6.6, wps=463134, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.294, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=18030 epoch 011: 1440 / 1689 loss=4.326, nll_loss=2.723, ppl=6.6, wps=463134, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.294, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=18030 epoch 011: 1440 / 1689 loss=4.326, nll_loss=2.723, ppl=6.6, wps=463134, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.294, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=18030 epoch 011: 1440 / 1689 loss=4.326, nll_loss=2.723, ppl=6.6, wps=463134, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.294, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=18030 epoch 011: 1440 / 1689 loss=4.326, nll_loss=2.723, ppl=6.6, wps=463134, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.294, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=18030 epoch 011: 1440 / 1689 loss=4.326, nll_loss=2.723, ppl=6.6, wps=463134, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.294, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=18030 epoch 011: 1440 / 1689 loss=4.326, nll_loss=2.723, ppl=6.6, wps=463134, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.294, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=18030 epoch 011: 1440 / 1689 loss=4.326, nll_loss=2.723, ppl=6.6, wps=463134, ups=1.07, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.294, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=18030 epoch 011: 1541 / 1689 loss=4.321, nll_loss=2.718, ppl=6.58, wps=460596, ups=1.06, wpb=434434, bsz=16003, num_updates=18400, lr=0.000466252, gnorm=0.281, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=18125 epoch 011: 1541 / 1689 loss=4.321, nll_loss=2.718, ppl=6.58, wps=460596, ups=1.06, wpb=434434, bsz=16003, num_updates=18400, lr=0.000466252, gnorm=0.281, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=18125 epoch 011: 1541 / 1689 loss=4.321, nll_loss=2.718, ppl=6.58, wps=460596, ups=1.06, wpb=434434, bsz=16003, num_updates=18400, lr=0.000466252, gnorm=0.281, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=18125 epoch 011: 1541 / 1689 loss=4.321, nll_loss=2.718, ppl=6.58, wps=460596, ups=1.06, wpb=434434, bsz=16003, num_updates=18400, lr=0.000466252, gnorm=0.281, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=18125 epoch 011: 1541 / 1689 loss=4.321, nll_loss=2.718, ppl=6.58, wps=460596, ups=1.06, wpb=434434, bsz=16003, num_updates=18400, lr=0.000466252, gnorm=0.281, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=18125 epoch 011: 1541 / 1689 loss=4.321, nll_loss=2.718, ppl=6.58, wps=460596, ups=1.06, wpb=434434, bsz=16003, num_updates=18400, lr=0.000466252, gnorm=0.281, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=18125 epoch 011: 1541 / 1689 loss=4.321, nll_loss=2.718, ppl=6.58, wps=460596, ups=1.06, wpb=434434, bsz=16003, num_updates=18400, lr=0.000466252, gnorm=0.281, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=18125 epoch 011: 1541 / 1689 loss=4.321, nll_loss=2.718, ppl=6.58, wps=460596, ups=1.06, wpb=434434, bsz=16003, num_updates=18400, lr=0.000466252, gnorm=0.281, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=18125 epoch 011: 1541 / 1689 loss=4.321, nll_loss=2.718, ppl=6.58, wps=460596, ups=1.06, wpb=434434, bsz=16003, num_updates=18400, lr=0.000466252, gnorm=0.281, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=18125 epoch 011: 1541 / 1689 loss=4.321, nll_loss=2.718, ppl=6.58, wps=460596, ups=1.06, wpb=434434, bsz=16003, num_updates=18400, lr=0.000466252, gnorm=0.281, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=18125 epoch 011: 1541 / 1689 loss=4.321, nll_loss=2.718, ppl=6.58, wps=460596, ups=1.06, wpb=434434, bsz=16003, num_updates=18400, lr=0.000466252, gnorm=0.281, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=18125 epoch 011: 1641 / 1689 loss=4.32, nll_loss=2.717, ppl=6.58, wps=467282, ups=1.08, wpb=432159, bsz=16487.4, num_updates=18500, lr=0.000464991, gnorm=0.271, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=18217 epoch 011: 1641 / 1689 loss=4.32, nll_loss=2.717, ppl=6.58, wps=467282, ups=1.08, wpb=432159, bsz=16487.4, num_updates=18500, lr=0.000464991, gnorm=0.271, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=18217 epoch 011: 1641 / 1689 loss=4.32, nll_loss=2.717, ppl=6.58, wps=467282, ups=1.08, wpb=432159, bsz=16487.4, num_updates=18500, lr=0.000464991, gnorm=0.271, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=18217 epoch 011: 1641 / 1689 loss=4.32, nll_loss=2.717, ppl=6.58, wps=467282, ups=1.08, wpb=432159, bsz=16487.4, num_updates=18500, lr=0.000464991, gnorm=0.271, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=18217 epoch 011: 1641 / 1689 loss=4.32, nll_loss=2.717, ppl=6.58, wps=467282, ups=1.08, wpb=432159, bsz=16487.4, num_updates=18500, lr=0.000464991, gnorm=0.271, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=18217 epoch 011: 1641 / 1689 loss=4.32, nll_loss=2.717, ppl=6.58, wps=467282, ups=1.08, wpb=432159, bsz=16487.4, num_updates=18500, lr=0.000464991, gnorm=0.271, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=18217 epoch 011: 1641 / 1689 loss=4.32, nll_loss=2.717, ppl=6.58, wps=467282, ups=1.08, wpb=432159, bsz=16487.4, num_updates=18500, lr=0.000464991, gnorm=0.271, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=18217 epoch 011: 1641 / 1689 loss=4.32, nll_loss=2.717, ppl=6.58, wps=467282, ups=1.08, wpb=432159, bsz=16487.4, num_updates=18500, lr=0.000464991, gnorm=0.271, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=18217 epoch 011: 1641 / 1689 loss=4.32, nll_loss=2.717, ppl=6.58, wps=467282, ups=1.08, wpb=432159, bsz=16487.4, num_updates=18500, lr=0.000464991, gnorm=0.271, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=18217 epoch 011: 1641 / 1689 loss=4.32, nll_loss=2.717, ppl=6.58, wps=467282, ups=1.08, wpb=432159, bsz=16487.4, num_updates=18500, lr=0.000464991, gnorm=0.271, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=18217 epoch 011: 1641 / 1689 loss=4.32, nll_loss=2.717, ppl=6.58, wps=467282, ups=1.08, wpb=432159, bsz=16487.4, num_updates=18500, lr=0.000464991, gnorm=0.271, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=18217 end of epoch 11 (average epoch stats below) epoch 011 | loss 4.319 | nll_loss 2.715 | ppl 6.57 | wps 394637 | ups 0.91 | wpb 433535 | bsz 16502.5 | num_updates 18548 | lr 0.000464388 | gnorm 0.286 | clip 0 | loss_scale 1 | train_wall 1666 | gb_free 20.8 | wall 18261 epoch 011 | loss 4.319 | nll_loss 2.715 | ppl 6.57 | wps 394637 | ups 0.91 | wpb 433535 | bsz 16502.5 | num_updates 18548 | lr 0.000464388 | gnorm 0.286 | clip 0 | loss_scale 1 | train_wall 1666 | gb_free 20.8 | wall 18261 epoch 011 | loss 4.319 | nll_loss 2.715 | ppl 6.57 | wps 394637 | ups 0.91 | wpb 433535 | bsz 16502.5 | num_updates 18548 | lr 0.000464388 | gnorm 0.286 | clip 0 | loss_scale 1 | train_wall 1666 | gb_free 20.8 | wall 18261 epoch 011 | loss 4.319 | nll_loss 2.715 | ppl 6.57 | wps 394637 | ups 0.91 | wpb 433535 | bsz 16502.5 | num_updates 18548 | lr 0.000464388 | gnorm 0.286 | clip 0 | loss_scale 1 | train_wall 1666 | gb_free 20.8 | wall 18261 epoch 011 | loss 4.319 | nll_loss 2.715 | ppl 6.57 | wps 394637 | ups 0.91 | wpb 433535 | bsz 16502.5 | num_updates 18548 | lr 0.000464388 | gnorm 0.286 | clip 0 | loss_scale 1 | train_wall 1666 | gb_free 20.8 | wall 18261 epoch 011 | loss 4.319 | nll_loss 2.715 | ppl 6.57 | wps 394637 | ups 0.91 | wpb 433535 | bsz 16502.5 | num_updates 18548 | lr 0.000464388 | gnorm 0.286 | clip 0 | loss_scale 1 | train_wall 1666 | gb_free 20.8 | wall 18261 epoch 011 | loss 4.319 | nll_loss 2.715 | ppl 6.57 | wps 394637 | ups 0.91 | wpb 433535 | bsz 16502.5 | num_updates 18548 | lr 0.000464388 | gnorm 0.286 | clip 0 | loss_scale 1 | train_wall 1666 | gb_free 20.8 | wall 18261 epoch 011 | loss 4.319 | nll_loss 2.715 | ppl 6.57 | wps 394637 | ups 0.91 | wpb 433535 | bsz 16502.5 | num_updates 18548 | lr 0.000464388 | gnorm 0.286 | clip 0 | loss_scale 1 | train_wall 1666 | gb_free 20.8 | wall 18261 epoch 011 | loss 4.319 | nll_loss 2.715 | ppl 6.57 | wps 394637 | ups 0.91 | wpb 433535 | bsz 16502.5 | num_updates 18548 | lr 0.000464388 | gnorm 0.286 | clip 0 | loss_scale 1 | train_wall 1666 | gb_free 20.8 | wall 18261 epoch 011 | loss 4.319 | nll_loss 2.715 | ppl 6.57 | wps 394637 | ups 0.91 | wpb 433535 | bsz 16502.5 | num_updates 18548 | lr 0.000464388 | gnorm 0.286 | clip 0 | loss_scale 1 | train_wall 1666 | gb_free 20.8 | wall 18261 epoch 011 | loss 4.319 | nll_loss 2.715 | ppl 6.57 | wps 394637 | ups 0.91 | wpb 433535 | bsz 16502.5 | num_updates 18548 | lr 0.000464388 | gnorm 0.286 | clip 0 | loss_scale 1 | train_wall 1666 | gb_free 20.8 | wall 18261 Start iterating over samples epoch 012: 52 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=460872, ups=1.07, wpb=429280, bsz=16363.1, num_updates=18600, lr=0.000463739, gnorm=0.282, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=18310 epoch 012: 52 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=460872, ups=1.07, wpb=429280, bsz=16363.1, num_updates=18600, lr=0.000463739, gnorm=0.282, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=18310 epoch 012: 52 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=460872, ups=1.07, wpb=429280, bsz=16363.1, num_updates=18600, lr=0.000463739, gnorm=0.282, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=18310 epoch 012: 52 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=460872, ups=1.07, wpb=429280, bsz=16363.1, num_updates=18600, lr=0.000463739, gnorm=0.282, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=18310 epoch 012: 52 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=460872, ups=1.07, wpb=429280, bsz=16363.1, num_updates=18600, lr=0.000463739, gnorm=0.282, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=18310 epoch 012: 52 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=460872, ups=1.07, wpb=429280, bsz=16363.1, num_updates=18600, lr=0.000463739, gnorm=0.282, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=18310 epoch 012: 52 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=460872, ups=1.07, wpb=429280, bsz=16363.1, num_updates=18600, lr=0.000463739, gnorm=0.282, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=18310 epoch 012: 52 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=460872, ups=1.07, wpb=429280, bsz=16363.1, num_updates=18600, lr=0.000463739, gnorm=0.282, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=18310 epoch 012: 52 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=460872, ups=1.07, wpb=429280, bsz=16363.1, num_updates=18600, lr=0.000463739, gnorm=0.282, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=18310 epoch 012: 52 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=460872, ups=1.07, wpb=429280, bsz=16363.1, num_updates=18600, lr=0.000463739, gnorm=0.282, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=18310 epoch 012: 52 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=460872, ups=1.07, wpb=429280, bsz=16363.1, num_updates=18600, lr=0.000463739, gnorm=0.282, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=18310 epoch 012: 52 / 1689 loss=4.305, nll_loss=2.7, ppl=6.5, wps=460872, ups=1.07, wpb=429280, bsz=16363.1, num_updates=18600, lr=0.000463739, gnorm=0.282, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=18310 epoch 012: 152 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=462752, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.285, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=18404 epoch 012: 152 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=462752, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.285, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=18404 epoch 012: 152 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=462752, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.285, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=18404 epoch 012: 152 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=462752, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.285, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=18404 epoch 012: 152 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=462752, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.285, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=18404 epoch 012: 152 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=462752, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.285, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=18404 epoch 012: 152 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=462752, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.285, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=18404 epoch 012: 152 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=462752, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.285, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=18404 epoch 012: 152 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=462752, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.285, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=18404 epoch 012: 152 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=462752, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.285, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=18404 epoch 012: 152 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=462752, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.285, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=18404 epoch 012: 152 / 1689 loss=4.289, nll_loss=2.681, ppl=6.41, wps=462752, ups=1.07, wpb=432865, bsz=16193.2, num_updates=18700, lr=0.000462497, gnorm=0.285, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=18404 epoch 012: 252 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=462385, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18498 epoch 012: 252 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=462385, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18498 epoch 012: 252 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=462385, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18498 epoch 012: 252 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=462385, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18498 epoch 012: 252 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=462385, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18498 epoch 012: 252 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=462385, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18498 epoch 012: 252 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=462385, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18498 epoch 012: 252 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=462385, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18498 epoch 012: 252 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=462385, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18498 epoch 012: 252 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=462385, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18498 epoch 012: 252 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=462385, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18498 epoch 012: 252 / 1689 loss=4.293, nll_loss=2.686, ppl=6.43, wps=462385, ups=1.07, wpb=433212, bsz=16798.2, num_updates=18800, lr=0.000461266, gnorm=0.288, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18498 epoch 012: 352 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=462273, ups=1.07, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.28, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=18591 epoch 012: 352 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=462273, ups=1.07, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.28, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=18591 epoch 012: 352 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=462273, ups=1.07, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.28, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=18591 epoch 012: 352 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=462273, ups=1.07, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.28, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=18591 epoch 012: 352 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=462273, ups=1.07, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.28, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=18591 epoch 012: 352 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=462273, ups=1.07, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.28, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=18591 epoch 012: 352 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=462273, ups=1.07, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.28, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=18591 epoch 012: 352 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=462273, ups=1.07, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.28, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=18591 epoch 012: 352 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=462273, ups=1.07, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.28, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=18591 epoch 012: 352 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=462273, ups=1.07, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.28, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=18591 epoch 012: 352 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=462273, ups=1.07, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.28, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=18591 epoch 012: 352 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=462273, ups=1.07, wpb=433517, bsz=16223, num_updates=18900, lr=0.000460044, gnorm=0.28, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=18591 epoch 012: 452 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=463494, ups=1.07, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.282, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=18685 epoch 012: 452 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=463494, ups=1.07, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.282, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=18685 epoch 012: 452 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=463494, ups=1.07, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.282, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=18685 epoch 012: 452 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=463494, ups=1.07, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.282, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=18685 epoch 012: 452 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=463494, ups=1.07, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.282, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=18685 epoch 012: 452 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=463494, ups=1.07, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.282, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=18685 epoch 012: 452 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=463494, ups=1.07, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.282, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=18685 epoch 012: 452 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=463494, ups=1.07, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.282, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=18685 epoch 012: 452 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=463494, ups=1.07, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.282, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=18685 epoch 012: 452 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=463494, ups=1.07, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.282, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=18685 epoch 012: 452 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=463494, ups=1.07, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.282, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=18685 epoch 012: 452 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=463494, ups=1.07, wpb=433953, bsz=16367, num_updates=19000, lr=0.000458831, gnorm=0.282, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=18685 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 4.307 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.307 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.307 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.307 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.307 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.307 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.307 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.307 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.307 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.307 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.307 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.307 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.307 epoch 012: 553 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=375848, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=18800 epoch 012: 553 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=375848, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=18800 epoch 012: 553 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=375848, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=18800 epoch 012: 553 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=375848, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=18800 epoch 012: 553 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=375848, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=18800 epoch 012: 553 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=375848, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=18800 epoch 012: 553 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=375848, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=18800 epoch 012: 553 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=375848, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=18800 epoch 012: 553 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=375848, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=18800 epoch 012: 553 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=375848, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=18800 epoch 012: 553 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=375848, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=18800 epoch 012: 553 / 1689 loss=4.298, nll_loss=2.691, ppl=6.46, wps=375848, ups=0.87, wpb=431385, bsz=16627.6, num_updates=19100, lr=0.000457629, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=18800 epoch 012: 653 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=459829, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=18894 epoch 012: 653 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=459829, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=18894 epoch 012: 653 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=459829, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=18894 epoch 012: 653 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=459829, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=18894 epoch 012: 653 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=459829, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=18894 epoch 012: 653 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=459829, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=18894 epoch 012: 653 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=459829, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=18894 epoch 012: 653 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=459829, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=18894 epoch 012: 653 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=459829, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=18894 epoch 012: 653 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=459829, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=18894 epoch 012: 653 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=459829, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=18894 epoch 012: 653 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=459829, ups=1.06, wpb=434520, bsz=16597.1, num_updates=19200, lr=0.000456435, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=18894 epoch 012: 753 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=459808, ups=1.06, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.29, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=18989 epoch 012: 753 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=459808, ups=1.06, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.29, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=18989 epoch 012: 753 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=459808, ups=1.06, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.29, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=18989 epoch 012: 753 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=459808, ups=1.06, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.29, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=18989 epoch 012: 753 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=459808, ups=1.06, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.29, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=18989 epoch 012: 753 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=459808, ups=1.06, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.29, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=18989 epoch 012: 753 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=459808, ups=1.06, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.29, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=18989 epoch 012: 753 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=459808, ups=1.06, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.29, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=18989 epoch 012: 753 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=459808, ups=1.06, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.29, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=18989 epoch 012: 753 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=459808, ups=1.06, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.29, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=18989 epoch 012: 753 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=459808, ups=1.06, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.29, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=18989 epoch 012: 753 / 1689 loss=4.302, nll_loss=2.696, ppl=6.48, wps=459808, ups=1.06, wpb=432998, bsz=16659, num_updates=19300, lr=0.000455251, gnorm=0.29, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=18989 epoch 012: 853 / 1689 loss=4.3, nll_loss=2.694, ppl=6.47, wps=459367, ups=1.06, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=19083 epoch 012: 853 / 1689 loss=4.3, nll_loss=2.694, ppl=6.47, wps=459367, ups=1.06, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=19083 epoch 012: 853 / 1689 loss=4.3, nll_loss=2.694, ppl=6.47, wps=459367, ups=1.06, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=19083 epoch 012: 853 / 1689 loss=4.3, nll_loss=2.694, ppl=6.47, wps=459367, ups=1.06, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=19083 epoch 012: 853 / 1689 loss=4.3, nll_loss=2.694, ppl=6.47, wps=459367, ups=1.06, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=19083 epoch 012: 853 / 1689 loss=4.3, nll_loss=2.694, ppl=6.47, wps=459367, ups=1.06, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=19083 epoch 012: 853 / 1689 loss=4.3, nll_loss=2.694, ppl=6.47, wps=459367, ups=1.06, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=19083 epoch 012: 853 / 1689 loss=4.3, nll_loss=2.694, ppl=6.47, wps=459367, ups=1.06, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=19083 epoch 012: 853 / 1689 loss=4.3, nll_loss=2.694, ppl=6.47, wps=459367, ups=1.06, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=19083 epoch 012: 853 / 1689 loss=4.3, nll_loss=2.694, ppl=6.47, wps=459367, ups=1.06, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=19083 epoch 012: 853 / 1689 loss=4.3, nll_loss=2.694, ppl=6.47, wps=459367, ups=1.06, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=19083 epoch 012: 853 / 1689 loss=4.3, nll_loss=2.694, ppl=6.47, wps=459367, ups=1.06, wpb=433510, bsz=16741.3, num_updates=19400, lr=0.000454077, gnorm=0.291, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=19083 epoch 012: 953 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=459610, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.282, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=19178 epoch 012: 953 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=459610, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.282, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=19178 epoch 012: 953 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=459610, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.282, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=19178 epoch 012: 953 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=459610, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.282, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=19178 epoch 012: 953 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=459610, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.282, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=19178 epoch 012: 953 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=459610, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.282, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=19178 epoch 012: 953 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=459610, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.282, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=19178 epoch 012: 953 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=459610, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.282, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=19178 epoch 012: 953 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=459610, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.282, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=19178 epoch 012: 953 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=459610, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.282, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=19178 epoch 012: 953 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=459610, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.282, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=19178 epoch 012: 953 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=459610, ups=1.05, wpb=437332, bsz=16717.8, num_updates=19500, lr=0.000452911, gnorm=0.282, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=19178 epoch 012: 1053 / 1689 loss=4.297, nll_loss=2.691, ppl=6.46, wps=460707, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19272 epoch 012: 1053 / 1689 loss=4.297, nll_loss=2.691, ppl=6.46, wps=460707, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19272 epoch 012: 1053 / 1689 loss=4.297, nll_loss=2.691, ppl=6.46, wps=460707, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19272 epoch 012: 1053 / 1689 loss=4.297, nll_loss=2.691, ppl=6.46, wps=460707, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19272 epoch 012: 1053 / 1689 loss=4.297, nll_loss=2.691, ppl=6.46, wps=460707, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19272 epoch 012: 1053 / 1689 loss=4.297, nll_loss=2.691, ppl=6.46, wps=460707, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19272 epoch 012: 1053 / 1689 loss=4.297, nll_loss=2.691, ppl=6.46, wps=460707, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19272 epoch 012: 1053 / 1689 loss=4.297, nll_loss=2.691, ppl=6.46, wps=460707, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19272 epoch 012: 1053 / 1689 loss=4.297, nll_loss=2.691, ppl=6.46, wps=460707, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19272 epoch 012: 1053 / 1689 loss=4.297, nll_loss=2.691, ppl=6.46, wps=460707, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19272 epoch 012: 1053 / 1689 loss=4.297, nll_loss=2.691, ppl=6.46, wps=460707, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19272 epoch 012: 1053 / 1689 loss=4.297, nll_loss=2.691, ppl=6.46, wps=460707, ups=1.06, wpb=433162, bsz=16085.3, num_updates=19600, lr=0.000451754, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=19272 epoch 012: 1153 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=462693, ups=1.07, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.287, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=19366 epoch 012: 1153 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=462693, ups=1.07, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.287, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=19366 epoch 012: 1153 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=462693, ups=1.07, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.287, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=19366 epoch 012: 1153 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=462693, ups=1.07, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.287, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=19366 epoch 012: 1153 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=462693, ups=1.07, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.287, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=19366 epoch 012: 1153 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=462693, ups=1.07, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.287, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=19366 epoch 012: 1153 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=462693, ups=1.07, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.287, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=19366 epoch 012: 1153 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=462693, ups=1.07, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.287, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=19366 epoch 012: 1153 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=462693, ups=1.07, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.287, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=19366 epoch 012: 1153 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=462693, ups=1.07, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.287, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=19366 epoch 012: 1153 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=462693, ups=1.07, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.287, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=19366 epoch 012: 1153 / 1689 loss=4.303, nll_loss=2.697, ppl=6.49, wps=462693, ups=1.07, wpb=433695, bsz=16526.5, num_updates=19700, lr=0.000450606, gnorm=0.287, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=19366 epoch 012: 1254 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=454163, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.272, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=19461 epoch 012: 1254 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=454163, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.272, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=19461 epoch 012: 1254 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=454163, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.272, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=19461 epoch 012: 1254 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=454163, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.272, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=19461 epoch 012: 1254 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=454163, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.272, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=19461 epoch 012: 1254 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=454163, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.272, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=19461 epoch 012: 1254 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=454163, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.272, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=19461 epoch 012: 1254 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=454163, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.272, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=19461 epoch 012: 1254 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=454163, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.272, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=19461 epoch 012: 1254 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=454163, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.272, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=19461 epoch 012: 1254 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=454163, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.272, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=19461 epoch 012: 1254 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=454163, ups=1.05, wpb=434297, bsz=16673.3, num_updates=19800, lr=0.000449467, gnorm=0.272, clip=0, loss_scale=1, train_wall=94, gb_free=19.5, wall=19461 epoch 012: 1354 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=462379, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.27, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=19556 epoch 012: 1354 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=462379, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.27, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=19556 epoch 012: 1354 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=462379, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.27, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=19556 epoch 012: 1354 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=462379, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.27, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=19556 epoch 012: 1354 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=462379, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.27, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=19556 epoch 012: 1354 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=462379, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.27, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=19556 epoch 012: 1354 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=462379, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.27, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=19556 epoch 012: 1354 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=462379, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.27, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=19556 epoch 012: 1354 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=462379, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.27, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=19556 epoch 012: 1354 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=462379, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.27, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=19556 epoch 012: 1354 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=462379, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.27, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=19556 epoch 012: 1354 / 1689 loss=4.307, nll_loss=2.702, ppl=6.51, wps=462379, ups=1.06, wpb=435214, bsz=16876.2, num_updates=19900, lr=0.000448336, gnorm=0.27, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=19556 epoch 012: 1454 / 1689 loss=4.314, nll_loss=2.711, ppl=6.55, wps=460518, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.282, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=19650 epoch 012: 1454 / 1689 loss=4.314, nll_loss=2.711, ppl=6.55, wps=460518, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.282, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=19650 epoch 012: 1454 / 1689 loss=4.314, nll_loss=2.711, ppl=6.55, wps=460518, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.282, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=19650 epoch 012: 1454 / 1689 loss=4.314, nll_loss=2.711, ppl=6.55, wps=460518, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.282, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=19650 epoch 012: 1454 / 1689 loss=4.314, nll_loss=2.711, ppl=6.55, wps=460518, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.282, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=19650 epoch 012: 1454 / 1689 loss=4.314, nll_loss=2.711, ppl=6.55, wps=460518, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.282, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=19650 epoch 012: 1454 / 1689 loss=4.314, nll_loss=2.711, ppl=6.55, wps=460518, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.282, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=19650 epoch 012: 1454 / 1689 loss=4.314, nll_loss=2.711, ppl=6.55, wps=460518, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.282, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=19650 epoch 012: 1454 / 1689 loss=4.314, nll_loss=2.711, ppl=6.55, wps=460518, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.282, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=19650 epoch 012: 1454 / 1689 loss=4.314, nll_loss=2.711, ppl=6.55, wps=460518, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.282, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=19650 epoch 012: 1454 / 1689 loss=4.314, nll_loss=2.711, ppl=6.55, wps=460518, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.282, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=19650 epoch 012: 1454 / 1689 loss=4.314, nll_loss=2.711, ppl=6.55, wps=460518, ups=1.06, wpb=435384, bsz=16294.6, num_updates=20000, lr=0.000447214, gnorm=0.282, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=19650 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 4.32 | nll_loss 2.676 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.32 | nll_loss 2.676 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.32 | nll_loss 2.676 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.32 | nll_loss 2.676 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.32 | nll_loss 2.676 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.32 | nll_loss 2.676 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.32 | nll_loss 2.676 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.32 | nll_loss 2.676 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.32 | nll_loss 2.676 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.32 | nll_loss 2.676 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.32 | nll_loss 2.676 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.307 epoch 012 | valid on 'valid' subset | loss 4.32 | nll_loss 2.676 | ppl 6.39 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.307 epoch 012: 1554 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=380841, ups=0.88, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.267, clip=0, loss_scale=1, train_wall=99, gb_free=18.9, wall=19764 epoch 012: 1554 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=380841, ups=0.88, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.267, clip=0, loss_scale=1, train_wall=99, gb_free=18.9, wall=19764 epoch 012: 1554 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=380841, ups=0.88, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.267, clip=0, loss_scale=1, train_wall=99, gb_free=18.9, wall=19764 epoch 012: 1554 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=380841, ups=0.88, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.267, clip=0, loss_scale=1, train_wall=99, gb_free=18.9, wall=19764 epoch 012: 1554 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=380841, ups=0.88, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.267, clip=0, loss_scale=1, train_wall=99, gb_free=18.9, wall=19764 epoch 012: 1554 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=380841, ups=0.88, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.267, clip=0, loss_scale=1, train_wall=99, gb_free=18.9, wall=19764 epoch 012: 1554 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=380841, ups=0.88, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.267, clip=0, loss_scale=1, train_wall=99, gb_free=18.9, wall=19764 epoch 012: 1554 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=380841, ups=0.88, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.267, clip=0, loss_scale=1, train_wall=99, gb_free=18.9, wall=19764 epoch 012: 1554 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=380841, ups=0.88, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.267, clip=0, loss_scale=1, train_wall=99, gb_free=18.9, wall=19764 epoch 012: 1554 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=380841, ups=0.88, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.267, clip=0, loss_scale=1, train_wall=99, gb_free=18.9, wall=19764 epoch 012: 1554 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=380841, ups=0.88, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.267, clip=0, loss_scale=1, train_wall=99, gb_free=18.9, wall=19764 epoch 012: 1554 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=380841, ups=0.88, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.267, clip=0, loss_scale=1, train_wall=99, gb_free=18.9, wall=19764 epoch 012: 1654 / 1689 loss=4.308, nll_loss=2.704, ppl=6.51, wps=461158, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.261, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=19858 epoch 012: 1654 / 1689 loss=4.308, nll_loss=2.704, ppl=6.51, wps=461158, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.261, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=19858 epoch 012: 1654 / 1689 loss=4.308, nll_loss=2.704, ppl=6.51, wps=461158, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.261, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=19858 epoch 012: 1654 / 1689 loss=4.308, nll_loss=2.704, ppl=6.51, wps=461158, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.261, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=19858 epoch 012: 1654 / 1689 loss=4.308, nll_loss=2.704, ppl=6.51, wps=461158, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.261, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=19858 epoch 012: 1654 / 1689 loss=4.308, nll_loss=2.704, ppl=6.51, wps=461158, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.261, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=19858 epoch 012: 1654 / 1689 loss=4.308, nll_loss=2.704, ppl=6.51, wps=461158, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.261, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=19858 epoch 012: 1654 / 1689 loss=4.308, nll_loss=2.704, ppl=6.51, wps=461158, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.261, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=19858 epoch 012: 1654 / 1689 loss=4.308, nll_loss=2.704, ppl=6.51, wps=461158, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.261, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=19858 epoch 012: 1654 / 1689 loss=4.308, nll_loss=2.704, ppl=6.51, wps=461158, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.261, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=19858 epoch 012: 1654 / 1689 loss=4.308, nll_loss=2.704, ppl=6.51, wps=461158, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.261, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=19858 epoch 012: 1654 / 1689 loss=4.308, nll_loss=2.704, ppl=6.51, wps=461158, ups=1.07, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.261, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=19858 end of epoch 12 (average epoch stats below) epoch 012 | loss 4.302 | nll_loss 2.696 | ppl 6.48 | wps 449078 | ups 1.04 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.278 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.7 | wall 19890 epoch 012 | loss 4.302 | nll_loss 2.696 | ppl 6.48 | wps 449078 | ups 1.04 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.278 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.7 | wall 19890 epoch 012 | loss 4.302 | nll_loss 2.696 | ppl 6.48 | wps 449078 | ups 1.04 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.278 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.7 | wall 19890 epoch 012 | loss 4.302 | nll_loss 2.696 | ppl 6.48 | wps 449078 | ups 1.04 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.278 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.7 | wall 19890 epoch 012 | loss 4.302 | nll_loss 2.696 | ppl 6.48 | wps 449078 | ups 1.04 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.278 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.7 | wall 19890 epoch 012 | loss 4.302 | nll_loss 2.696 | ppl 6.48 | wps 449078 | ups 1.04 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.278 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.7 | wall 19890 epoch 012 | loss 4.302 | nll_loss 2.696 | ppl 6.48 | wps 449078 | ups 1.04 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.278 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.7 | wall 19890 epoch 012 | loss 4.302 | nll_loss 2.696 | ppl 6.48 | wps 449078 | ups 1.04 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.278 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.7 | wall 19890 epoch 012 | loss 4.302 | nll_loss 2.696 | ppl 6.48 | wps 449078 | ups 1.04 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.278 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.7 | wall 19890 epoch 012 | loss 4.302 | nll_loss 2.696 | ppl 6.48 | wps 449078 | ups 1.04 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.278 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.7 | wall 19890 epoch 012 | loss 4.302 | nll_loss 2.696 | ppl 6.48 | wps 449078 | ups 1.04 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.278 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.7 | wall 19890 epoch 012 | loss 4.302 | nll_loss 2.696 | ppl 6.48 | wps 449078 | ups 1.04 | wpb 433534 | bsz 16507 | num_updates 20235 | lr 0.000444609 | gnorm 0.278 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 20.7 | wall 19890 Start iterating over samples epoch 013: 65 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=459762, ups=1.07, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.287, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=19951 epoch 013: 65 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=459762, ups=1.07, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.287, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=19951 epoch 013: 65 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=459762, ups=1.07, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.287, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=19951 epoch 013: 65 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=459762, ups=1.07, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.287, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=19951 epoch 013: 65 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=459762, ups=1.07, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.287, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=19951 epoch 013: 65 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=459762, ups=1.07, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.287, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=19951 epoch 013: 65 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=459762, ups=1.07, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.287, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=19951 epoch 013: 65 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=459762, ups=1.07, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.287, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=19951 epoch 013: 65 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=459762, ups=1.07, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.287, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=19951 epoch 013: 65 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=459762, ups=1.07, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.287, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=19951 epoch 013: 65 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=459762, ups=1.07, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.287, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=19951 epoch 013: 65 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=459762, ups=1.07, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.287, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=19951 epoch 013: 65 / 1689 loss=4.279, nll_loss=2.671, ppl=6.37, wps=459762, ups=1.07, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.287, clip=0, loss_scale=2, train_wall=91, gb_free=18.9, wall=19951 epoch 013: 166 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=460284, ups=1.06, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=20046 epoch 013: 166 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=460284, ups=1.06, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=20046 epoch 013: 166 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=460284, ups=1.06, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=20046 epoch 013: 166 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=460284, ups=1.06, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=20046 epoch 013: 166 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=460284, ups=1.06, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=20046 epoch 013: 166 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=460284, ups=1.06, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=20046 epoch 013: 166 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=460284, ups=1.06, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=20046 epoch 013: 166 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=460284, ups=1.06, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=20046 epoch 013: 166 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=460284, ups=1.06, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=20046 epoch 013: 166 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=460284, ups=1.06, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=20046 epoch 013: 166 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=460284, ups=1.06, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=20046 epoch 013: 166 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=460284, ups=1.06, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=20046 epoch 013: 166 / 1689 loss=4.272, nll_loss=2.662, ppl=6.33, wps=460284, ups=1.06, wpb=434308, bsz=16401.3, num_updates=20400, lr=0.000442807, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=20046 epoch 013: 266 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=464661, ups=1.07, wpb=435137, bsz=16357.1, num_updates=20500, lr=0.000441726, gnorm=0.275, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=20139 epoch 013: 266 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=464661, ups=1.07, wpb=435137, bsz=16357.1, num_updates=20500, lr=0.000441726, gnorm=0.275, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=20139 epoch 013: 266 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=464661, ups=1.07, wpb=435137, bsz=16357.1, num_updates=20500, lr=0.000441726, gnorm=0.275, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=20139 epoch 013: 266 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=464661, ups=1.07, wpb=435137, bsz=16357.1, num_updates=20500, lr=0.000441726, gnorm=0.275, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=20139 epoch 013: 266 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=464661, ups=1.07, wpb=435137, bsz=16357.1, num_updates=20500, lr=0.000441726, gnorm=0.275, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=20139 epoch 013: 266 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=464661, ups=1.07, wpb=435137, bsz=16357.1, num_updates=20500, lr=0.000441726, gnorm=0.275, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=20139 epoch 013: 266 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=464661, ups=1.07, wpb=435137, bsz=16357.1, num_updates=20500, lr=0.000441726, gnorm=0.275, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=20139 epoch 013: 266 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=464661, ups=1.07, wpb=435137, bsz=16357.1, num_updates=20500, lr=0.000441726, gnorm=0.275, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=20139 epoch 013: 266 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=464661, ups=1.07, wpb=435137, bsz=16357.1, num_updates=20500, lr=0.000441726, gnorm=0.275, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=20139 epoch 013: 266 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=464661, ups=1.07, wpb=435137, bsz=16357.1, num_updates=20500, lr=0.000441726, gnorm=0.275, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=20139 epoch 013: 266 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=464661, ups=1.07, wpb=435137, bsz=16357.1, num_updates=20500, lr=0.000441726, gnorm=0.275, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=20139 epoch 013: 266 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=464661, ups=1.07, wpb=435137, bsz=16357.1, num_updates=20500, lr=0.000441726, gnorm=0.275, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=20139 epoch 013: 266 / 1689 loss=4.282, nll_loss=2.674, ppl=6.38, wps=464661, ups=1.07, wpb=435137, bsz=16357.1, num_updates=20500, lr=0.000441726, gnorm=0.275, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=20139 epoch 013: 366 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=461073, ups=1.06, wpb=433642, bsz=16743, num_updates=20600, lr=0.000440653, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=20234 epoch 013: 366 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=461073, ups=1.06, wpb=433642, bsz=16743, num_updates=20600, lr=0.000440653, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=20234 epoch 013: 366 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=461073, ups=1.06, wpb=433642, bsz=16743, num_updates=20600, lr=0.000440653, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=20234 epoch 013: 366 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=461073, ups=1.06, wpb=433642, bsz=16743, num_updates=20600, lr=0.000440653, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=20234 epoch 013: 366 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=461073, ups=1.06, wpb=433642, bsz=16743, num_updates=20600, lr=0.000440653, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=20234 epoch 013: 366 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=461073, ups=1.06, wpb=433642, bsz=16743, num_updates=20600, lr=0.000440653, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=20234 epoch 013: 366 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=461073, ups=1.06, wpb=433642, bsz=16743, num_updates=20600, lr=0.000440653, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=20234 epoch 013: 366 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=461073, ups=1.06, wpb=433642, bsz=16743, num_updates=20600, lr=0.000440653, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=20234 epoch 013: 366 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=461073, ups=1.06, wpb=433642, bsz=16743, num_updates=20600, lr=0.000440653, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=20234 epoch 013: 366 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=461073, ups=1.06, wpb=433642, bsz=16743, num_updates=20600, lr=0.000440653, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=20234 epoch 013: 366 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=461073, ups=1.06, wpb=433642, bsz=16743, num_updates=20600, lr=0.000440653, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=20234 epoch 013: 366 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=461073, ups=1.06, wpb=433642, bsz=16743, num_updates=20600, lr=0.000440653, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=20234 epoch 013: 366 / 1689 loss=4.273, nll_loss=2.664, ppl=6.34, wps=461073, ups=1.06, wpb=433642, bsz=16743, num_updates=20600, lr=0.000440653, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=20234 epoch 013: 466 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=461150, ups=1.07, wpb=431847, bsz=16645, num_updates=20700, lr=0.000439587, gnorm=0.276, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20327 epoch 013: 466 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=461150, ups=1.07, wpb=431847, bsz=16645, num_updates=20700, lr=0.000439587, gnorm=0.276, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20327 epoch 013: 466 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=461150, ups=1.07, wpb=431847, bsz=16645, num_updates=20700, lr=0.000439587, gnorm=0.276, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20327 epoch 013: 466 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=461150, ups=1.07, wpb=431847, bsz=16645, num_updates=20700, lr=0.000439587, gnorm=0.276, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20327 epoch 013: 466 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=461150, ups=1.07, wpb=431847, bsz=16645, num_updates=20700, lr=0.000439587, gnorm=0.276, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20327 epoch 013: 466 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=461150, ups=1.07, wpb=431847, bsz=16645, num_updates=20700, lr=0.000439587, gnorm=0.276, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20327 epoch 013: 466 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=461150, ups=1.07, wpb=431847, bsz=16645, num_updates=20700, lr=0.000439587, gnorm=0.276, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20327 epoch 013: 466 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=461150, ups=1.07, wpb=431847, bsz=16645, num_updates=20700, lr=0.000439587, gnorm=0.276, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20327 epoch 013: 466 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=461150, ups=1.07, wpb=431847, bsz=16645, num_updates=20700, lr=0.000439587, gnorm=0.276, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20327 epoch 013: 466 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=461150, ups=1.07, wpb=431847, bsz=16645, num_updates=20700, lr=0.000439587, gnorm=0.276, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20327 epoch 013: 466 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=461150, ups=1.07, wpb=431847, bsz=16645, num_updates=20700, lr=0.000439587, gnorm=0.276, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20327 epoch 013: 466 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=461150, ups=1.07, wpb=431847, bsz=16645, num_updates=20700, lr=0.000439587, gnorm=0.276, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20327 epoch 013: 466 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=461150, ups=1.07, wpb=431847, bsz=16645, num_updates=20700, lr=0.000439587, gnorm=0.276, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20327 epoch 013: 566 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=465340, ups=1.07, wpb=433950, bsz=16100.4, num_updates=20800, lr=0.000438529, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=20420 epoch 013: 566 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=465340, ups=1.07, wpb=433950, bsz=16100.4, num_updates=20800, lr=0.000438529, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=20420 epoch 013: 566 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=465340, ups=1.07, wpb=433950, bsz=16100.4, num_updates=20800, lr=0.000438529, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=20420 epoch 013: 566 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=465340, ups=1.07, wpb=433950, bsz=16100.4, num_updates=20800, lr=0.000438529, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=20420 epoch 013: 566 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=465340, ups=1.07, wpb=433950, bsz=16100.4, num_updates=20800, lr=0.000438529, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=20420 epoch 013: 566 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=465340, ups=1.07, wpb=433950, bsz=16100.4, num_updates=20800, lr=0.000438529, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=20420 epoch 013: 566 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=465340, ups=1.07, wpb=433950, bsz=16100.4, num_updates=20800, lr=0.000438529, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=20420 epoch 013: 566 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=465340, ups=1.07, wpb=433950, bsz=16100.4, num_updates=20800, lr=0.000438529, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=20420 epoch 013: 566 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=465340, ups=1.07, wpb=433950, bsz=16100.4, num_updates=20800, lr=0.000438529, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=20420 epoch 013: 566 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=465340, ups=1.07, wpb=433950, bsz=16100.4, num_updates=20800, lr=0.000438529, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=20420 epoch 013: 566 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=465340, ups=1.07, wpb=433950, bsz=16100.4, num_updates=20800, lr=0.000438529, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=20420 epoch 013: 566 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=465340, ups=1.07, wpb=433950, bsz=16100.4, num_updates=20800, lr=0.000438529, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=20420 epoch 013: 566 / 1689 loss=4.29, nll_loss=2.683, ppl=6.42, wps=465340, ups=1.07, wpb=433950, bsz=16100.4, num_updates=20800, lr=0.000438529, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=20420 epoch 013: 666 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465926, ups=1.07, wpb=435045, bsz=17056.2, num_updates=20900, lr=0.000437479, gnorm=0.273, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=20514 epoch 013: 666 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465926, ups=1.07, wpb=435045, bsz=17056.2, num_updates=20900, lr=0.000437479, gnorm=0.273, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=20514 epoch 013: 666 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465926, ups=1.07, wpb=435045, bsz=17056.2, num_updates=20900, lr=0.000437479, gnorm=0.273, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=20514 epoch 013: 666 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465926, ups=1.07, wpb=435045, bsz=17056.2, num_updates=20900, lr=0.000437479, gnorm=0.273, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=20514 epoch 013: 666 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465926, ups=1.07, wpb=435045, bsz=17056.2, num_updates=20900, lr=0.000437479, gnorm=0.273, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=20514 epoch 013: 666 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465926, ups=1.07, wpb=435045, bsz=17056.2, num_updates=20900, lr=0.000437479, gnorm=0.273, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=20514 epoch 013: 666 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465926, ups=1.07, wpb=435045, bsz=17056.2, num_updates=20900, lr=0.000437479, gnorm=0.273, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=20514 epoch 013: 666 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465926, ups=1.07, wpb=435045, bsz=17056.2, num_updates=20900, lr=0.000437479, gnorm=0.273, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=20514 epoch 013: 666 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465926, ups=1.07, wpb=435045, bsz=17056.2, num_updates=20900, lr=0.000437479, gnorm=0.273, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=20514 epoch 013: 666 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465926, ups=1.07, wpb=435045, bsz=17056.2, num_updates=20900, lr=0.000437479, gnorm=0.273, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=20514 epoch 013: 666 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465926, ups=1.07, wpb=435045, bsz=17056.2, num_updates=20900, lr=0.000437479, gnorm=0.273, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=20514 epoch 013: 666 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465926, ups=1.07, wpb=435045, bsz=17056.2, num_updates=20900, lr=0.000437479, gnorm=0.273, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=20514 epoch 013: 666 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465926, ups=1.07, wpb=435045, bsz=17056.2, num_updates=20900, lr=0.000437479, gnorm=0.273, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=20514 epoch 013: 767 / 1689 loss=4.287, nll_loss=2.68, ppl=6.41, wps=458890, ups=1.06, wpb=432546, bsz=16497, num_updates=21000, lr=0.000436436, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=20608 epoch 013: 767 / 1689 loss=4.287, nll_loss=2.68, ppl=6.41, wps=458890, ups=1.06, wpb=432546, bsz=16497, num_updates=21000, lr=0.000436436, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=20608 epoch 013: 767 / 1689 loss=4.287, nll_loss=2.68, ppl=6.41, wps=458890, ups=1.06, wpb=432546, bsz=16497, num_updates=21000, lr=0.000436436, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=20608 epoch 013: 767 / 1689 loss=4.287, nll_loss=2.68, ppl=6.41, wps=458890, ups=1.06, wpb=432546, bsz=16497, num_updates=21000, lr=0.000436436, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=20608 epoch 013: 767 / 1689 loss=4.287, nll_loss=2.68, ppl=6.41, wps=458890, ups=1.06, wpb=432546, bsz=16497, num_updates=21000, lr=0.000436436, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=20608 epoch 013: 767 / 1689 loss=4.287, nll_loss=2.68, ppl=6.41, wps=458890, ups=1.06, wpb=432546, bsz=16497, num_updates=21000, lr=0.000436436, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=20608 epoch 013: 767 / 1689 loss=4.287, nll_loss=2.68, ppl=6.41, wps=458890, ups=1.06, wpb=432546, bsz=16497, num_updates=21000, lr=0.000436436, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=20608 epoch 013: 767 / 1689 loss=4.287, nll_loss=2.68, ppl=6.41, wps=458890, ups=1.06, wpb=432546, bsz=16497, num_updates=21000, lr=0.000436436, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=20608 epoch 013: 767 / 1689 loss=4.287, nll_loss=2.68, ppl=6.41, wps=458890, ups=1.06, wpb=432546, bsz=16497, num_updates=21000, lr=0.000436436, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=20608 epoch 013: 767 / 1689 loss=4.287, nll_loss=2.68, ppl=6.41, wps=458890, ups=1.06, wpb=432546, bsz=16497, num_updates=21000, lr=0.000436436, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=20608 epoch 013: 767 / 1689 loss=4.287, nll_loss=2.68, ppl=6.41, wps=458890, ups=1.06, wpb=432546, bsz=16497, num_updates=21000, lr=0.000436436, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=20608 epoch 013: 767 / 1689 loss=4.287, nll_loss=2.68, ppl=6.41, wps=458890, ups=1.06, wpb=432546, bsz=16497, num_updates=21000, lr=0.000436436, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=20608 epoch 013: 767 / 1689 loss=4.287, nll_loss=2.68, ppl=6.41, wps=458890, ups=1.06, wpb=432546, bsz=16497, num_updates=21000, lr=0.000436436, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=20608 begin validation on "valid" subset epoch 013 | valid on 'valid' subset | loss 4.288 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.288 epoch 013 | valid on 'valid' subset | loss 4.288 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.288 epoch 013 | valid on 'valid' subset | loss 4.288 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.288 epoch 013 | valid on 'valid' subset | loss 4.288 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.288 epoch 013 | valid on 'valid' subset | loss 4.288 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.288 epoch 013 | valid on 'valid' subset | loss 4.288 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.288 epoch 013 | valid on 'valid' subset | loss 4.288 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.288 epoch 013 | valid on 'valid' subset | loss 4.288 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.288 epoch 013 | valid on 'valid' subset | loss 4.288 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.288 epoch 013 | valid on 'valid' subset | loss 4.288 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.288 epoch 013 | valid on 'valid' subset | loss 4.288 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.288 epoch 013 | valid on 'valid' subset | loss 4.288 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.288 epoch 013 | valid on 'valid' subset | loss 4.288 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.288 epoch 013: 867 / 1689 loss=4.294, nll_loss=2.688, ppl=6.44, wps=384528, ups=0.88, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=20721 epoch 013: 867 / 1689 loss=4.294, nll_loss=2.688, ppl=6.44, wps=384528, ups=0.88, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=20721 epoch 013: 867 / 1689 loss=4.294, nll_loss=2.688, ppl=6.44, wps=384528, ups=0.88, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=20721 epoch 013: 867 / 1689 loss=4.294, nll_loss=2.688, ppl=6.44, wps=384528, ups=0.88, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=20721 epoch 013: 867 / 1689 loss=4.294, nll_loss=2.688, ppl=6.44, wps=384528, ups=0.88, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=20721 epoch 013: 867 / 1689 loss=4.294, nll_loss=2.688, ppl=6.44, wps=384528, ups=0.88, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=20721 epoch 013: 867 / 1689 loss=4.294, nll_loss=2.688, ppl=6.44, wps=384528, ups=0.88, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=20721 epoch 013: 867 / 1689 loss=4.294, nll_loss=2.688, ppl=6.44, wps=384528, ups=0.88, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=20721 epoch 013: 867 / 1689 loss=4.294, nll_loss=2.688, ppl=6.44, wps=384528, ups=0.88, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=20721 epoch 013: 867 / 1689 loss=4.294, nll_loss=2.688, ppl=6.44, wps=384528, ups=0.88, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=20721 epoch 013: 867 / 1689 loss=4.294, nll_loss=2.688, ppl=6.44, wps=384528, ups=0.88, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=20721 epoch 013: 867 / 1689 loss=4.294, nll_loss=2.688, ppl=6.44, wps=384528, ups=0.88, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=20721 epoch 013: 867 / 1689 loss=4.294, nll_loss=2.688, ppl=6.44, wps=384528, ups=0.88, wpb=435183, bsz=16687.9, num_updates=21100, lr=0.0004354, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=20721 epoch 013: 967 / 1689 loss=4.288, nll_loss=2.681, ppl=6.41, wps=462508, ups=1.07, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20815 epoch 013: 967 / 1689 loss=4.288, nll_loss=2.681, ppl=6.41, wps=462508, ups=1.07, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20815 epoch 013: 967 / 1689 loss=4.288, nll_loss=2.681, ppl=6.41, wps=462508, ups=1.07, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20815 epoch 013: 967 / 1689 loss=4.288, nll_loss=2.681, ppl=6.41, wps=462508, ups=1.07, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20815 epoch 013: 967 / 1689 loss=4.288, nll_loss=2.681, ppl=6.41, wps=462508, ups=1.07, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20815 epoch 013: 967 / 1689 loss=4.288, nll_loss=2.681, ppl=6.41, wps=462508, ups=1.07, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20815 epoch 013: 967 / 1689 loss=4.288, nll_loss=2.681, ppl=6.41, wps=462508, ups=1.07, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20815 epoch 013: 967 / 1689 loss=4.288, nll_loss=2.681, ppl=6.41, wps=462508, ups=1.07, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20815 epoch 013: 967 / 1689 loss=4.288, nll_loss=2.681, ppl=6.41, wps=462508, ups=1.07, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20815 epoch 013: 967 / 1689 loss=4.288, nll_loss=2.681, ppl=6.41, wps=462508, ups=1.07, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20815 epoch 013: 967 / 1689 loss=4.288, nll_loss=2.681, ppl=6.41, wps=462508, ups=1.07, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20815 epoch 013: 967 / 1689 loss=4.288, nll_loss=2.681, ppl=6.41, wps=462508, ups=1.07, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20815 epoch 013: 967 / 1689 loss=4.288, nll_loss=2.681, ppl=6.41, wps=462508, ups=1.07, wpb=431940, bsz=16433, num_updates=21200, lr=0.000434372, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=20815 epoch 013: 1067 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=469216, ups=1.08, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=20907 epoch 013: 1067 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=469216, ups=1.08, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=20907 epoch 013: 1067 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=469216, ups=1.08, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=20907 epoch 013: 1067 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=469216, ups=1.08, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=20907 epoch 013: 1067 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=469216, ups=1.08, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=20907 epoch 013: 1067 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=469216, ups=1.08, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=20907 epoch 013: 1067 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=469216, ups=1.08, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=20907 epoch 013: 1067 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=469216, ups=1.08, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=20907 epoch 013: 1067 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=469216, ups=1.08, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=20907 epoch 013: 1067 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=469216, ups=1.08, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=20907 epoch 013: 1067 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=469216, ups=1.08, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=20907 epoch 013: 1067 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=469216, ups=1.08, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=20907 epoch 013: 1067 / 1689 loss=4.295, nll_loss=2.689, ppl=6.45, wps=469216, ups=1.08, wpb=435830, bsz=16370.2, num_updates=21300, lr=0.000433351, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=20907 epoch 013: 1167 / 1689 loss=4.283, nll_loss=2.675, ppl=6.39, wps=466179, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.28, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=21001 epoch 013: 1167 / 1689 loss=4.283, nll_loss=2.675, ppl=6.39, wps=466179, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.28, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=21001 epoch 013: 1167 / 1689 loss=4.283, nll_loss=2.675, ppl=6.39, wps=466179, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.28, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=21001 epoch 013: 1167 / 1689 loss=4.283, nll_loss=2.675, ppl=6.39, wps=466179, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.28, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=21001 epoch 013: 1167 / 1689 loss=4.283, nll_loss=2.675, ppl=6.39, wps=466179, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.28, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=21001 epoch 013: 1167 / 1689 loss=4.283, nll_loss=2.675, ppl=6.39, wps=466179, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.28, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=21001 epoch 013: 1167 / 1689 loss=4.283, nll_loss=2.675, ppl=6.39, wps=466179, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.28, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=21001 epoch 013: 1167 / 1689 loss=4.283, nll_loss=2.675, ppl=6.39, wps=466179, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.28, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=21001 epoch 013: 1167 / 1689 loss=4.283, nll_loss=2.675, ppl=6.39, wps=466179, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.28, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=21001 epoch 013: 1167 / 1689 loss=4.283, nll_loss=2.675, ppl=6.39, wps=466179, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.28, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=21001 epoch 013: 1167 / 1689 loss=4.283, nll_loss=2.675, ppl=6.39, wps=466179, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.28, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=21001 epoch 013: 1167 / 1689 loss=4.283, nll_loss=2.675, ppl=6.39, wps=466179, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.28, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=21001 epoch 013: 1167 / 1689 loss=4.283, nll_loss=2.675, ppl=6.39, wps=466179, ups=1.07, wpb=433858, bsz=16490.5, num_updates=21400, lr=0.000432338, gnorm=0.28, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=21001 epoch 013: 1267 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465664, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.286, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=21094 epoch 013: 1267 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465664, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.286, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=21094 epoch 013: 1267 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465664, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.286, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=21094 epoch 013: 1267 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465664, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.286, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=21094 epoch 013: 1267 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465664, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.286, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=21094 epoch 013: 1267 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465664, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.286, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=21094 epoch 013: 1267 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465664, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.286, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=21094 epoch 013: 1267 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465664, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.286, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=21094 epoch 013: 1267 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465664, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.286, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=21094 epoch 013: 1267 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465664, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.286, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=21094 epoch 013: 1267 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465664, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.286, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=21094 epoch 013: 1267 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465664, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.286, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=21094 epoch 013: 1267 / 1689 loss=4.291, nll_loss=2.685, ppl=6.43, wps=465664, ups=1.07, wpb=433430, bsz=16283.7, num_updates=21500, lr=0.000431331, gnorm=0.286, clip=0, loss_scale=2, train_wall=91, gb_free=19.2, wall=21094 epoch 013: 1367 / 1689 loss=4.299, nll_loss=2.695, ppl=6.47, wps=461235, ups=1.06, wpb=433300, bsz=16785.8, num_updates=21600, lr=0.000430331, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=21188 epoch 013: 1367 / 1689 loss=4.299, nll_loss=2.695, ppl=6.47, wps=461235, ups=1.06, wpb=433300, bsz=16785.8, num_updates=21600, lr=0.000430331, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=21188 epoch 013: 1367 / 1689 loss=4.299, nll_loss=2.695, ppl=6.47, wps=461235, ups=1.06, wpb=433300, bsz=16785.8, num_updates=21600, lr=0.000430331, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=21188 epoch 013: 1367 / 1689 loss=4.299, nll_loss=2.695, ppl=6.47, wps=461235, ups=1.06, wpb=433300, bsz=16785.8, num_updates=21600, lr=0.000430331, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=21188 epoch 013: 1367 / 1689 loss=4.299, nll_loss=2.695, ppl=6.47, wps=461235, ups=1.06, wpb=433300, bsz=16785.8, num_updates=21600, lr=0.000430331, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=21188 epoch 013: 1367 / 1689 loss=4.299, nll_loss=2.695, ppl=6.47, wps=461235, ups=1.06, wpb=433300, bsz=16785.8, num_updates=21600, lr=0.000430331, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=21188 epoch 013: 1367 / 1689 loss=4.299, nll_loss=2.695, ppl=6.47, wps=461235, ups=1.06, wpb=433300, bsz=16785.8, num_updates=21600, lr=0.000430331, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=21188 epoch 013: 1367 / 1689 loss=4.299, nll_loss=2.695, ppl=6.47, wps=461235, ups=1.06, wpb=433300, bsz=16785.8, num_updates=21600, lr=0.000430331, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=21188 epoch 013: 1367 / 1689 loss=4.299, nll_loss=2.695, ppl=6.47, wps=461235, ups=1.06, wpb=433300, bsz=16785.8, num_updates=21600, lr=0.000430331, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=21188 epoch 013: 1367 / 1689 loss=4.299, nll_loss=2.695, ppl=6.47, wps=461235, ups=1.06, wpb=433300, bsz=16785.8, num_updates=21600, lr=0.000430331, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=21188 epoch 013: 1367 / 1689 loss=4.299, nll_loss=2.695, ppl=6.47, wps=461235, ups=1.06, wpb=433300, bsz=16785.8, num_updates=21600, lr=0.000430331, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=21188 epoch 013: 1367 / 1689 loss=4.299, nll_loss=2.695, ppl=6.47, wps=461235, ups=1.06, wpb=433300, bsz=16785.8, num_updates=21600, lr=0.000430331, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=21188 epoch 013: 1367 / 1689 loss=4.299, nll_loss=2.695, ppl=6.47, wps=461235, ups=1.06, wpb=433300, bsz=16785.8, num_updates=21600, lr=0.000430331, gnorm=0.266, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=21188 epoch 013: 1467 / 1689 loss=4.29, nll_loss=2.684, ppl=6.43, wps=460806, ups=1.06, wpb=433226, bsz=16619.8, num_updates=21700, lr=0.000429339, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=21282 epoch 013: 1467 / 1689 loss=4.29, nll_loss=2.684, ppl=6.43, wps=460806, ups=1.06, wpb=433226, bsz=16619.8, num_updates=21700, lr=0.000429339, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=21282 epoch 013: 1467 / 1689 loss=4.29, nll_loss=2.684, ppl=6.43, wps=460806, ups=1.06, wpb=433226, bsz=16619.8, num_updates=21700, lr=0.000429339, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=21282 epoch 013: 1467 / 1689 loss=4.29, nll_loss=2.684, ppl=6.43, wps=460806, ups=1.06, wpb=433226, bsz=16619.8, num_updates=21700, lr=0.000429339, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=21282 epoch 013: 1467 / 1689 loss=4.29, nll_loss=2.684, ppl=6.43, wps=460806, ups=1.06, wpb=433226, bsz=16619.8, num_updates=21700, lr=0.000429339, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=21282 epoch 013: 1467 / 1689 loss=4.29, nll_loss=2.684, ppl=6.43, wps=460806, ups=1.06, wpb=433226, bsz=16619.8, num_updates=21700, lr=0.000429339, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=21282 epoch 013: 1467 / 1689 loss=4.29, nll_loss=2.684, ppl=6.43, wps=460806, ups=1.06, wpb=433226, bsz=16619.8, num_updates=21700, lr=0.000429339, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=21282 epoch 013: 1467 / 1689 loss=4.29, nll_loss=2.684, ppl=6.43, wps=460806, ups=1.06, wpb=433226, bsz=16619.8, num_updates=21700, lr=0.000429339, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=21282 epoch 013: 1467 / 1689 loss=4.29, nll_loss=2.684, ppl=6.43, wps=460806, ups=1.06, wpb=433226, bsz=16619.8, num_updates=21700, lr=0.000429339, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=21282 epoch 013: 1467 / 1689 loss=4.29, nll_loss=2.684, ppl=6.43, wps=460806, ups=1.06, wpb=433226, bsz=16619.8, num_updates=21700, lr=0.000429339, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=21282 epoch 013: 1467 / 1689 loss=4.29, nll_loss=2.684, ppl=6.43, wps=460806, ups=1.06, wpb=433226, bsz=16619.8, num_updates=21700, lr=0.000429339, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=21282 epoch 013: 1467 / 1689 loss=4.29, nll_loss=2.684, ppl=6.43, wps=460806, ups=1.06, wpb=433226, bsz=16619.8, num_updates=21700, lr=0.000429339, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=21282 epoch 013: 1467 / 1689 loss=4.29, nll_loss=2.684, ppl=6.43, wps=460806, ups=1.06, wpb=433226, bsz=16619.8, num_updates=21700, lr=0.000429339, gnorm=0.253, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=21282 epoch 013: 1567 / 1689 loss=4.291, nll_loss=2.686, ppl=6.43, wps=460833, ups=1.07, wpb=432672, bsz=16598, num_updates=21800, lr=0.000428353, gnorm=0.278, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=21376 epoch 013: 1567 / 1689 loss=4.291, nll_loss=2.686, ppl=6.43, wps=460833, ups=1.07, wpb=432672, bsz=16598, num_updates=21800, lr=0.000428353, gnorm=0.278, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=21376 epoch 013: 1567 / 1689 loss=4.291, nll_loss=2.686, ppl=6.43, wps=460833, ups=1.07, wpb=432672, bsz=16598, num_updates=21800, lr=0.000428353, gnorm=0.278, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=21376 epoch 013: 1567 / 1689 loss=4.291, nll_loss=2.686, ppl=6.43, wps=460833, ups=1.07, wpb=432672, bsz=16598, num_updates=21800, lr=0.000428353, gnorm=0.278, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=21376 epoch 013: 1567 / 1689 loss=4.291, nll_loss=2.686, ppl=6.43, wps=460833, ups=1.07, wpb=432672, bsz=16598, num_updates=21800, lr=0.000428353, gnorm=0.278, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=21376 epoch 013: 1567 / 1689 loss=4.291, nll_loss=2.686, ppl=6.43, wps=460833, ups=1.07, wpb=432672, bsz=16598, num_updates=21800, lr=0.000428353, gnorm=0.278, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=21376 epoch 013: 1567 / 1689 loss=4.291, nll_loss=2.686, ppl=6.43, wps=460833, ups=1.07, wpb=432672, bsz=16598, num_updates=21800, lr=0.000428353, gnorm=0.278, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=21376 epoch 013: 1567 / 1689 loss=4.291, nll_loss=2.686, ppl=6.43, wps=460833, ups=1.07, wpb=432672, bsz=16598, num_updates=21800, lr=0.000428353, gnorm=0.278, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=21376 epoch 013: 1567 / 1689 loss=4.291, nll_loss=2.686, ppl=6.43, wps=460833, ups=1.07, wpb=432672, bsz=16598, num_updates=21800, lr=0.000428353, gnorm=0.278, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=21376 epoch 013: 1567 / 1689 loss=4.291, nll_loss=2.686, ppl=6.43, wps=460833, ups=1.07, wpb=432672, bsz=16598, num_updates=21800, lr=0.000428353, gnorm=0.278, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=21376 epoch 013: 1567 / 1689 loss=4.291, nll_loss=2.686, ppl=6.43, wps=460833, ups=1.07, wpb=432672, bsz=16598, num_updates=21800, lr=0.000428353, gnorm=0.278, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=21376 epoch 013: 1567 / 1689 loss=4.291, nll_loss=2.686, ppl=6.43, wps=460833, ups=1.07, wpb=432672, bsz=16598, num_updates=21800, lr=0.000428353, gnorm=0.278, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=21376 epoch 013: 1567 / 1689 loss=4.291, nll_loss=2.686, ppl=6.43, wps=460833, ups=1.07, wpb=432672, bsz=16598, num_updates=21800, lr=0.000428353, gnorm=0.278, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=21376 epoch 013: 1667 / 1689 loss=4.284, nll_loss=2.678, ppl=6.4, wps=463296, ups=1.07, wpb=432982, bsz=16378, num_updates=21900, lr=0.000427374, gnorm=0.277, clip=0, loss_scale=2, train_wall=92, gb_free=20.9, wall=21469 epoch 013: 1667 / 1689 loss=4.284, nll_loss=2.678, ppl=6.4, wps=463296, ups=1.07, wpb=432982, bsz=16378, num_updates=21900, lr=0.000427374, gnorm=0.277, clip=0, loss_scale=2, train_wall=92, gb_free=20.9, wall=21469 epoch 013: 1667 / 1689 loss=4.284, nll_loss=2.678, ppl=6.4, wps=463296, ups=1.07, wpb=432982, bsz=16378, num_updates=21900, lr=0.000427374, gnorm=0.277, clip=0, loss_scale=2, train_wall=92, gb_free=20.9, wall=21469 epoch 013: 1667 / 1689 loss=4.284, nll_loss=2.678, ppl=6.4, wps=463296, ups=1.07, wpb=432982, bsz=16378, num_updates=21900, lr=0.000427374, gnorm=0.277, clip=0, loss_scale=2, train_wall=92, gb_free=20.9, wall=21469 epoch 013: 1667 / 1689 loss=4.284, nll_loss=2.678, ppl=6.4, wps=463296, ups=1.07, wpb=432982, bsz=16378, num_updates=21900, lr=0.000427374, gnorm=0.277, clip=0, loss_scale=2, train_wall=92, gb_free=20.9, wall=21469 epoch 013: 1667 / 1689 loss=4.284, nll_loss=2.678, ppl=6.4, wps=463296, ups=1.07, wpb=432982, bsz=16378, num_updates=21900, lr=0.000427374, gnorm=0.277, clip=0, loss_scale=2, train_wall=92, gb_free=20.9, wall=21469 epoch 013: 1667 / 1689 loss=4.284, nll_loss=2.678, ppl=6.4, wps=463296, ups=1.07, wpb=432982, bsz=16378, num_updates=21900, lr=0.000427374, gnorm=0.277, clip=0, loss_scale=2, train_wall=92, gb_free=20.9, wall=21469 epoch 013: 1667 / 1689 loss=4.284, nll_loss=2.678, ppl=6.4, wps=463296, ups=1.07, wpb=432982, bsz=16378, num_updates=21900, lr=0.000427374, gnorm=0.277, clip=0, loss_scale=2, train_wall=92, gb_free=20.9, wall=21469 epoch 013: 1667 / 1689 loss=4.284, nll_loss=2.678, ppl=6.4, wps=463296, ups=1.07, wpb=432982, bsz=16378, num_updates=21900, lr=0.000427374, gnorm=0.277, clip=0, loss_scale=2, train_wall=92, gb_free=20.9, wall=21469 epoch 013: 1667 / 1689 loss=4.284, nll_loss=2.678, ppl=6.4, wps=463296, ups=1.07, wpb=432982, bsz=16378, num_updates=21900, lr=0.000427374, gnorm=0.277, clip=0, loss_scale=2, train_wall=92, gb_free=20.9, wall=21469 epoch 013: 1667 / 1689 loss=4.284, nll_loss=2.678, ppl=6.4, wps=463296, ups=1.07, wpb=432982, bsz=16378, num_updates=21900, lr=0.000427374, gnorm=0.277, clip=0, loss_scale=2, train_wall=92, gb_free=20.9, wall=21469 epoch 013: 1667 / 1689 loss=4.284, nll_loss=2.678, ppl=6.4, wps=463296, ups=1.07, wpb=432982, bsz=16378, num_updates=21900, lr=0.000427374, gnorm=0.277, clip=0, loss_scale=2, train_wall=92, gb_free=20.9, wall=21469 epoch 013: 1667 / 1689 loss=4.284, nll_loss=2.678, ppl=6.4, wps=463296, ups=1.07, wpb=432982, bsz=16378, num_updates=21900, lr=0.000427374, gnorm=0.277, clip=0, loss_scale=2, train_wall=92, gb_free=20.9, wall=21469 end of epoch 13 (average epoch stats below) epoch 013 | loss 4.286 | nll_loss 2.679 | ppl 6.4 | wps 457378 | ups 1.06 | wpb 433516 | bsz 16504.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.271 | clip 0 | loss_scale 2 | train_wall 1551 | gb_free 19.9 | wall 21489 epoch 013 | loss 4.286 | nll_loss 2.679 | ppl 6.4 | wps 457378 | ups 1.06 | wpb 433516 | bsz 16504.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.271 | clip 0 | loss_scale 2 | train_wall 1551 | gb_free 19.9 | wall 21489 epoch 013 | loss 4.286 | nll_loss 2.679 | ppl 6.4 | wps 457378 | ups 1.06 | wpb 433516 | bsz 16504.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.271 | clip 0 | loss_scale 2 | train_wall 1551 | gb_free 19.9 | wall 21489 epoch 013 | loss 4.286 | nll_loss 2.679 | ppl 6.4 | wps 457378 | ups 1.06 | wpb 433516 | bsz 16504.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.271 | clip 0 | loss_scale 2 | train_wall 1551 | gb_free 19.9 | wall 21489 epoch 013 | loss 4.286 | nll_loss 2.679 | ppl 6.4 | wps 457378 | ups 1.06 | wpb 433516 | bsz 16504.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.271 | clip 0 | loss_scale 2 | train_wall 1551 | gb_free 19.9 | wall 21489 epoch 013 | loss 4.286 | nll_loss 2.679 | ppl 6.4 | wps 457378 | ups 1.06 | wpb 433516 | bsz 16504.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.271 | clip 0 | loss_scale 2 | train_wall 1551 | gb_free 19.9 | wall 21489 epoch 013 | loss 4.286 | nll_loss 2.679 | ppl 6.4 | wps 457378 | ups 1.06 | wpb 433516 | bsz 16504.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.271 | clip 0 | loss_scale 2 | train_wall 1551 | gb_free 19.9 | wall 21489 epoch 013 | loss 4.286 | nll_loss 2.679 | ppl 6.4 | wps 457378 | ups 1.06 | wpb 433516 | bsz 16504.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.271 | clip 0 | loss_scale 2 | train_wall 1551 | gb_free 19.9 | wall 21489 epoch 013 | loss 4.286 | nll_loss 2.679 | ppl 6.4 | wps 457378 | ups 1.06 | wpb 433516 | bsz 16504.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.271 | clip 0 | loss_scale 2 | train_wall 1551 | gb_free 19.9 | wall 21489 epoch 013 | loss 4.286 | nll_loss 2.679 | ppl 6.4 | wps 457378 | ups 1.06 | wpb 433516 | bsz 16504.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.271 | clip 0 | loss_scale 2 | train_wall 1551 | gb_free 19.9 | wall 21489 epoch 013 | loss 4.286 | nll_loss 2.679 | ppl 6.4 | wps 457378 | ups 1.06 | wpb 433516 | bsz 16504.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.271 | clip 0 | loss_scale 2 | train_wall 1551 | gb_free 19.9 | wall 21489 epoch 013 | loss 4.286 | nll_loss 2.679 | ppl 6.4 | wps 457378 | ups 1.06 | wpb 433516 | bsz 16504.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.271 | clip 0 | loss_scale 2 | train_wall 1551 | gb_free 19.9 | wall 21489 epoch 013 | loss 4.286 | nll_loss 2.679 | ppl 6.4 | wps 457378 | ups 1.06 | wpb 433516 | bsz 16504.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.271 | clip 0 | loss_scale 2 | train_wall 1551 | gb_free 19.9 | wall 21489 Start iterating over samples epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 epoch 014: 79 / 1689 loss=4.263, nll_loss=2.653, ppl=6.29, wps=437778, ups=1.02, wpb=429258, bsz=16257.3, num_updates=22000, lr=0.000426401, gnorm=0.268, clip=0, loss_scale=2, train_wall=94, gb_free=19.4, wall=21567 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.285 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.285 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 180 / 1689 loss=4.271, nll_loss=2.662, ppl=6.33, wps=351374, ups=0.81, wpb=434517, bsz=16524.6, num_updates=22100, lr=0.000425436, gnorm=0.273, clip=0, loss_scale=1, train_wall=100, gb_free=18.7, wall=21691 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 280 / 1689 loss=4.261, nll_loss=2.651, ppl=6.28, wps=470182, ups=1.09, wpb=432551, bsz=16448.2, num_updates=22200, lr=0.000424476, gnorm=0.268, clip=0, loss_scale=1, train_wall=91, gb_free=19.9, wall=21783 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 380 / 1689 loss=4.267, nll_loss=2.658, ppl=6.31, wps=466376, ups=1.08, wpb=433102, bsz=16713.1, num_updates=22300, lr=0.000423524, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21876 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 480 / 1689 loss=4.267, nll_loss=2.657, ppl=6.31, wps=469414, ups=1.07, wpb=437504, bsz=16435, num_updates=22400, lr=0.000422577, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=21969 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 580 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=464094, ups=1.07, wpb=433706, bsz=16536.9, num_updates=22500, lr=0.000421637, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=22062 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 681 / 1689 loss=4.274, nll_loss=2.665, ppl=6.34, wps=457560, ups=1.06, wpb=432727, bsz=16394.9, num_updates=22600, lr=0.000420703, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=22157 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 781 / 1689 loss=4.269, nll_loss=2.66, ppl=6.32, wps=463300, ups=1.06, wpb=435986, bsz=16398.2, num_updates=22700, lr=0.000419775, gnorm=0.272, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=22251 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 881 / 1689 loss=4.276, nll_loss=2.668, ppl=6.36, wps=463623, ups=1.07, wpb=434863, bsz=16606.4, num_updates=22800, lr=0.000418854, gnorm=0.263, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22345 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 981 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=455457, ups=1.06, wpb=429150, bsz=16847.2, num_updates=22900, lr=0.000417938, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=22439 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 epoch 014: 1081 / 1689 loss=4.269, nll_loss=2.661, ppl=6.32, wps=460070, ups=1.06, wpb=435184, bsz=16629.4, num_updates=23000, lr=0.000417029, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=22534 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014 | valid on 'valid' subset | loss 4.292 | nll_loss 2.65 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.285 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1181 / 1689 loss=4.27, nll_loss=2.662, ppl=6.33, wps=404688, ups=0.94, wpb=431820, bsz=16491.8, num_updates=23100, lr=0.000416125, gnorm=0.267, clip=0, loss_scale=2, train_wall=92, gb_free=21.6, wall=22640 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1281 / 1689 loss=4.274, nll_loss=2.666, ppl=6.35, wps=461821, ups=1.06, wpb=434270, bsz=16358.7, num_updates=23200, lr=0.000415227, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=22734 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1381 / 1689 loss=4.292, nll_loss=2.687, ppl=6.44, wps=463815, ups=1.07, wpb=435037, bsz=16324.6, num_updates=23300, lr=0.000414335, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=22828 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1482 / 1689 loss=4.28, nll_loss=2.673, ppl=6.38, wps=458563, ups=1.05, wpb=434823, bsz=16492.2, num_updates=23400, lr=0.000413449, gnorm=0.267, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=22923 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1582 / 1689 loss=4.288, nll_loss=2.683, ppl=6.42, wps=462272, ups=1.07, wpb=433427, bsz=16634.5, num_updates=23500, lr=0.000412568, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23017 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 epoch 014: 1682 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=456265, ups=1.06, wpb=432169, bsz=16388.8, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=23111 end of epoch 14 (average epoch stats below) epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 epoch 014 | loss 4.272 | nll_loss 2.663 | ppl 6.34 | wps 448632 | ups 1.03 | wpb 433526 | bsz 16502.4 | num_updates 23607 | lr 0.000411632 | gnorm 0.264 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 21.7 | wall 23117 Start iterating over samples epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 93 / 1689 loss=4.238, nll_loss=2.625, ppl=6.17, wps=459466, ups=1.07, wpb=430642, bsz=16221.6, num_updates=23700, lr=0.000410824, gnorm=0.261, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=23205 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 193 / 1689 loss=4.245, nll_loss=2.632, ppl=6.2, wps=463653, ups=1.07, wpb=434433, bsz=16478.2, num_updates=23800, lr=0.00040996, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=23299 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 293 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=459569, ups=1.06, wpb=434394, bsz=16549.4, num_updates=23900, lr=0.000409101, gnorm=0.266, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23393 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 epoch 015: 393 / 1689 loss=4.266, nll_loss=2.657, ppl=6.31, wps=463618, ups=1.07, wpb=434459, bsz=16779.6, num_updates=24000, lr=0.000408248, gnorm=0.27, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=23487 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015 | valid on 'valid' subset | loss 4.283 | nll_loss 2.636 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.283 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 494 / 1689 loss=4.259, nll_loss=2.649, ppl=6.27, wps=383161, ups=0.88, wpb=435797, bsz=16786.7, num_updates=24100, lr=0.0004074, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=23601 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 594 / 1689 loss=4.25, nll_loss=2.639, ppl=6.23, wps=464804, ups=1.08, wpb=431776, bsz=16393.3, num_updates=24200, lr=0.000406558, gnorm=0.263, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=23694 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 694 / 1689 loss=4.257, nll_loss=2.647, ppl=6.26, wps=461382, ups=1.06, wpb=433900, bsz=16706.2, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23788 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 794 / 1689 loss=4.263, nll_loss=2.654, ppl=6.29, wps=461498, ups=1.07, wpb=431360, bsz=16278.7, num_updates=24400, lr=0.000404888, gnorm=0.269, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=23881 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 894 / 1689 loss=4.268, nll_loss=2.659, ppl=6.32, wps=462334, ups=1.06, wpb=434714, bsz=16201.2, num_updates=24500, lr=0.000404061, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=23975 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 994 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464862, ups=1.07, wpb=432844, bsz=16229.6, num_updates=24600, lr=0.000403239, gnorm=0.264, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=24068 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1094 / 1689 loss=4.264, nll_loss=2.655, ppl=6.3, wps=460622, ups=1.06, wpb=433534, bsz=16796.6, num_updates=24700, lr=0.000402422, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=17.8, wall=24162 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1194 / 1689 loss=4.271, nll_loss=2.663, ppl=6.33, wps=464448, ups=1.07, wpb=434156, bsz=16229.9, num_updates=24800, lr=0.00040161, gnorm=0.256, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=24256 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1294 / 1689 loss=4.262, nll_loss=2.653, ppl=6.29, wps=458260, ups=1.06, wpb=431700, bsz=16983.2, num_updates=24900, lr=0.000400802, gnorm=0.256, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=24350 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 epoch 015: 1394 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=464053, ups=1.07, wpb=432950, bsz=16535.2, num_updates=25000, lr=0.0004, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=24443 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015 | valid on 'valid' subset | loss 4.279 | nll_loss 2.635 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.279 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1495 / 1689 loss=4.265, nll_loss=2.657, ppl=6.31, wps=352831, ups=0.81, wpb=435134, bsz=16728.1, num_updates=25100, lr=0.000399202, gnorm=0.262, clip=0, loss_scale=2, train_wall=98, gb_free=18.5, wall=24567 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 epoch 015: 1595 / 1689 loss=4.265, nll_loss=2.656, ppl=6.3, wps=464656, ups=1.07, wpb=434902, bsz=16471.1, num_updates=25200, lr=0.00039841, gnorm=0.257, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24660 end of epoch 15 (average epoch stats below) epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 epoch 015 | loss 4.26 | nll_loss 2.65 | ppl 6.28 | wps 448224 | ups 1.03 | wpb 433558 | bsz 16504.4 | num_updates 25293 | lr 0.000397676 | gnorm 0.262 | clip 0 | loss_scale 1 | train_wall 1560 | gb_free 19.9 | wall 24748 Start iterating over samples epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 7 / 1689 loss=4.258, nll_loss=2.649, ppl=6.27, wps=454672, ups=1.06, wpb=430566, bsz=16073, num_updates=25300, lr=0.000397621, gnorm=0.273, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=24755 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 107 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=462575, ups=1.07, wpb=431619, bsz=16507, num_updates=25400, lr=0.000396838, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=24848 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 207 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=465407, ups=1.07, wpb=434464, bsz=16623, num_updates=25500, lr=0.000396059, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=24942 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 307 / 1689 loss=4.242, nll_loss=2.629, ppl=6.19, wps=464008, ups=1.07, wpb=435364, bsz=16255, num_updates=25600, lr=0.000395285, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=25036 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 407 / 1689 loss=4.244, nll_loss=2.632, ppl=6.2, wps=466529, ups=1.07, wpb=435355, bsz=16064.8, num_updates=25700, lr=0.000394515, gnorm=0.273, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=25129 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 507 / 1689 loss=4.248, nll_loss=2.636, ppl=6.22, wps=461517, ups=1.07, wpb=433298, bsz=16165.4, num_updates=25800, lr=0.00039375, gnorm=0.252, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=25223 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 607 / 1689 loss=4.254, nll_loss=2.643, ppl=6.25, wps=463746, ups=1.07, wpb=434192, bsz=16482.3, num_updates=25900, lr=0.000392989, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=18.5, wall=25316 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 epoch 016: 708 / 1689 loss=4.246, nll_loss=2.635, ppl=6.21, wps=459029, ups=1.06, wpb=433873, bsz=16525.5, num_updates=26000, lr=0.000392232, gnorm=0.264, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=25411 begin validation on "valid" subset epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016 | valid on 'valid' subset | loss 4.278 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.278 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 808 / 1689 loss=4.239, nll_loss=2.627, ppl=6.18, wps=367142, ups=0.85, wpb=432261, bsz=16603, num_updates=26100, lr=0.00039148, gnorm=0.269, clip=0, loss_scale=1, train_wall=97, gb_free=18.9, wall=25529 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 908 / 1689 loss=4.25, nll_loss=2.64, ppl=6.23, wps=465203, ups=1.07, wpb=434696, bsz=16726.2, num_updates=26200, lr=0.000390732, gnorm=0.256, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=25622 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1008 / 1689 loss=4.248, nll_loss=2.637, ppl=6.22, wps=470472, ups=1.08, wpb=434074, bsz=16340.1, num_updates=26300, lr=0.000389989, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=25714 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1108 / 1689 loss=4.255, nll_loss=2.645, ppl=6.26, wps=469324, ups=1.08, wpb=433551, bsz=16556.8, num_updates=26400, lr=0.000389249, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=25807 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1208 / 1689 loss=4.252, nll_loss=2.643, ppl=6.25, wps=461755, ups=1.06, wpb=434068, bsz=17102.3, num_updates=26500, lr=0.000388514, gnorm=0.253, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=25901 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1309 / 1689 loss=4.252, nll_loss=2.642, ppl=6.24, wps=459551, ups=1.06, wpb=432818, bsz=16727.5, num_updates=26600, lr=0.000387783, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=25995 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1409 / 1689 loss=4.243, nll_loss=2.632, ppl=6.2, wps=465341, ups=1.07, wpb=433399, bsz=16416.8, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26088 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1509 / 1689 loss=4.246, nll_loss=2.636, ppl=6.21, wps=464642, ups=1.07, wpb=432870, bsz=16571.8, num_updates=26800, lr=0.000386334, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=26181 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 epoch 016: 1609 / 1689 loss=4.26, nll_loss=2.652, ppl=6.28, wps=462216, ups=1.07, wpb=432708, bsz=16350.6, num_updates=26900, lr=0.000385615, gnorm=0.274, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=26275 end of epoch 16 (average epoch stats below) epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 epoch 016 | loss 4.248 | nll_loss 2.638 | ppl 6.22 | wps 456964 | ups 1.05 | wpb 433548 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.256 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 20.2 | wall 26348 Start iterating over samples epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 epoch 017: 20 / 1689 loss=4.26, nll_loss=2.651, ppl=6.28, wps=445967, ups=1.03, wpb=432374, bsz=16596.1, num_updates=27000, lr=0.0003849, gnorm=0.255, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26372 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017 | valid on 'valid' subset | loss 4.271 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.271 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 120 / 1689 loss=4.223, nll_loss=2.608, ppl=6.1, wps=385256, ups=0.88, wpb=436061, bsz=16652.4, num_updates=27100, lr=0.000384189, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=26485 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 221 / 1689 loss=4.226, nll_loss=2.612, ppl=6.11, wps=456307, ups=1.05, wpb=433342, bsz=16458.2, num_updates=27200, lr=0.000383482, gnorm=0.269, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26580 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 321 / 1689 loss=4.227, nll_loss=2.613, ppl=6.12, wps=463429, ups=1.07, wpb=431651, bsz=16518.2, num_updates=27300, lr=0.00038278, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=26673 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 421 / 1689 loss=4.243, nll_loss=2.631, ppl=6.19, wps=464926, ups=1.07, wpb=435204, bsz=16557.2, num_updates=27400, lr=0.00038208, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=26767 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 521 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=465333, ups=1.07, wpb=434552, bsz=16343.4, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=26860 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 621 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=461130, ups=1.06, wpb=433049, bsz=16215.5, num_updates=27600, lr=0.000380693, gnorm=0.27, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26954 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 721 / 1689 loss=4.233, nll_loss=2.62, ppl=6.15, wps=459705, ups=1.06, wpb=432448, bsz=16390.3, num_updates=27700, lr=0.000380006, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=27048 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 822 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=454599, ups=1.05, wpb=433212, bsz=17143, num_updates=27800, lr=0.000379322, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27143 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 922 / 1689 loss=4.254, nll_loss=2.645, ppl=6.25, wps=462709, ups=1.07, wpb=434094, bsz=16690.2, num_updates=27900, lr=0.000378641, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=27237 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 epoch 017: 1022 / 1689 loss=4.236, nll_loss=2.624, ppl=6.17, wps=460142, ups=1.06, wpb=433691, bsz=16295.8, num_updates=28000, lr=0.000377964, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=27331 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017 | valid on 'valid' subset | loss 4.27 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.27 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1122 / 1689 loss=4.247, nll_loss=2.637, ppl=6.22, wps=384120, ups=0.88, wpb=435453, bsz=16652.8, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=27445 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1222 / 1689 loss=4.241, nll_loss=2.629, ppl=6.19, wps=463811, ups=1.07, wpb=433944, bsz=16318.1, num_updates=28200, lr=0.000376622, gnorm=0.254, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=27538 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1323 / 1689 loss=4.244, nll_loss=2.633, ppl=6.2, wps=459007, ups=1.06, wpb=431569, bsz=16287.4, num_updates=28300, lr=0.000375956, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=27632 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1423 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=462184, ups=1.07, wpb=432822, bsz=16477.8, num_updates=28400, lr=0.000375293, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=27726 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1523 / 1689 loss=4.248, nll_loss=2.638, ppl=6.23, wps=457573, ups=1.06, wpb=432188, bsz=16742.1, num_updates=28500, lr=0.000374634, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27821 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 epoch 017: 1623 / 1689 loss=4.256, nll_loss=2.647, ppl=6.27, wps=463042, ups=1.06, wpb=435673, bsz=16729.7, num_updates=28600, lr=0.000373979, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=27915 end of epoch 17 (average epoch stats below) epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 epoch 017 | loss 4.238 | nll_loss 2.627 | ppl 6.18 | wps 449324 | ups 1.04 | wpb 433535 | bsz 16506.5 | num_updates 28666 | lr 0.000373548 | gnorm 0.253 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.9 | wall 27975 Start iterating over samples epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 34 / 1689 loss=4.231, nll_loss=2.618, ppl=6.14, wps=461875, ups=1.07, wpb=431355, bsz=16073, num_updates=28700, lr=0.000373327, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=28008 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 134 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=465452, ups=1.07, wpb=434710, bsz=16406.9, num_updates=28800, lr=0.000372678, gnorm=0.251, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=28101 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 235 / 1689 loss=4.228, nll_loss=2.615, ppl=6.13, wps=462323, ups=1.06, wpb=436445, bsz=16466.3, num_updates=28900, lr=0.000372033, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=28196 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 epoch 018: 335 / 1689 loss=4.233, nll_loss=2.621, ppl=6.15, wps=460340, ups=1.06, wpb=433151, bsz=16521, num_updates=29000, lr=0.000371391, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=28290 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018 | valid on 'valid' subset | loss 4.262 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.262 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 435 / 1689 loss=4.225, nll_loss=2.612, ppl=6.11, wps=352528, ups=0.81, wpb=433249, bsz=16676.8, num_updates=29100, lr=0.000370752, gnorm=0.26, clip=0, loss_scale=1, train_wall=98, gb_free=19.1, wall=28413 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 535 / 1689 loss=4.217, nll_loss=2.602, ppl=6.07, wps=464014, ups=1.07, wpb=431920, bsz=16561.1, num_updates=29200, lr=0.000370117, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28506 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 635 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=462581, ups=1.07, wpb=433715, bsz=16306.1, num_updates=29300, lr=0.000369484, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=28600 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 735 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=466023, ups=1.07, wpb=433859, bsz=16574.6, num_updates=29400, lr=0.000368856, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=28693 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 836 / 1689 loss=4.238, nll_loss=2.627, ppl=6.18, wps=456969, ups=1.05, wpb=434668, bsz=16850.8, num_updates=29500, lr=0.00036823, gnorm=0.262, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=28788 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 936 / 1689 loss=4.235, nll_loss=2.623, ppl=6.16, wps=464722, ups=1.07, wpb=433657, bsz=16304.9, num_updates=29600, lr=0.000367607, gnorm=0.26, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=28881 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1036 / 1689 loss=4.236, nll_loss=2.624, ppl=6.16, wps=464281, ups=1.07, wpb=434518, bsz=16259.4, num_updates=29700, lr=0.000366988, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=28975 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1136 / 1689 loss=4.24, nll_loss=2.629, ppl=6.19, wps=466620, ups=1.07, wpb=435204, bsz=16506.8, num_updates=29800, lr=0.000366372, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19, wall=29068 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1236 / 1689 loss=4.231, nll_loss=2.619, ppl=6.14, wps=461811, ups=1.07, wpb=432460, bsz=16613.3, num_updates=29900, lr=0.000365758, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=29162 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 epoch 018: 1337 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=464846, ups=1.07, wpb=435975, bsz=16310.5, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=29256 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018 | valid on 'valid' subset | loss 4.252 | nll_loss 2.607 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.252 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1437 / 1689 loss=4.232, nll_loss=2.62, ppl=6.15, wps=359979, ups=0.83, wpb=433980, bsz=16345.8, num_updates=30100, lr=0.000364541, gnorm=0.276, clip=0, loss_scale=1, train_wall=97, gb_free=20, wall=29376 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1537 / 1689 loss=4.222, nll_loss=2.609, ppl=6.1, wps=460792, ups=1.07, wpb=430890, bsz=16351.3, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29470 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 epoch 018: 1637 / 1689 loss=4.224, nll_loss=2.611, ppl=6.11, wps=461649, ups=1.07, wpb=430595, bsz=16986.3, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=29563 end of epoch 18 (average epoch stats below) epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 epoch 018 | loss 4.229 | nll_loss 2.616 | ppl 6.13 | wps 446823 | ups 1.03 | wpb 433506 | bsz 16506.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.252 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 21.2 | wall 29611 Start iterating over samples epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 48 / 1689 loss=4.23, nll_loss=2.617, ppl=6.14, wps=459647, ups=1.07, wpb=431501, bsz=16546.5, num_updates=30400, lr=0.000362738, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=29657 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 148 / 1689 loss=4.192, nll_loss=2.574, ppl=5.95, wps=459877, ups=1.06, wpb=431909, bsz=17014.2, num_updates=30500, lr=0.000362143, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=29751 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 249 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=456863, ups=1.06, wpb=432406, bsz=16463.4, num_updates=30600, lr=0.000361551, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=29845 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 349 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=459918, ups=1.06, wpb=433281, bsz=16605.1, num_updates=30700, lr=0.000360961, gnorm=0.264, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=29940 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 449 / 1689 loss=4.213, nll_loss=2.598, ppl=6.05, wps=464108, ups=1.07, wpb=433875, bsz=16229.8, num_updates=30800, lr=0.000360375, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=30033 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 549 / 1689 loss=4.237, nll_loss=2.625, ppl=6.17, wps=465372, ups=1.07, wpb=434485, bsz=16535.9, num_updates=30900, lr=0.000359791, gnorm=0.257, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=30126 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 epoch 019: 650 / 1689 loss=4.222, nll_loss=2.608, ppl=6.1, wps=458140, ups=1.06, wpb=433245, bsz=16957.8, num_updates=31000, lr=0.000359211, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=30221 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019 | valid on 'valid' subset | loss 4.257 | nll_loss 2.614 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.252 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 750 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=409496, ups=0.95, wpb=431821, bsz=16713.9, num_updates=31100, lr=0.000358633, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=30326 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 850 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=475968, ups=1.09, wpb=436166, bsz=16362.5, num_updates=31200, lr=0.000358057, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.8, wall=30418 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 950 / 1689 loss=4.217, nll_loss=2.603, ppl=6.08, wps=471264, ups=1.09, wpb=434038, bsz=16318.3, num_updates=31300, lr=0.000357485, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.4, wall=30510 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1050 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=460774, ups=1.07, wpb=432582, bsz=16340, num_updates=31400, lr=0.000356915, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=30604 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1150 / 1689 loss=4.229, nll_loss=2.617, ppl=6.13, wps=471407, ups=1.08, wpb=435612, bsz=16340.4, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=30696 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1250 / 1689 loss=4.234, nll_loss=2.623, ppl=6.16, wps=462656, ups=1.07, wpb=434192, bsz=16718, num_updates=31600, lr=0.000355784, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=17.8, wall=30790 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1350 / 1689 loss=4.234, nll_loss=2.622, ppl=6.16, wps=462954, ups=1.06, wpb=435559, bsz=16704.2, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=30884 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1450 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=462614, ups=1.07, wpb=431717, bsz=16159, num_updates=31800, lr=0.000354663, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30978 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1550 / 1689 loss=4.229, nll_loss=2.617, ppl=6.14, wps=466061, ups=1.07, wpb=434459, bsz=16247.1, num_updates=31900, lr=0.000354107, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=31071 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 epoch 019: 1650 / 1689 loss=4.231, nll_loss=2.62, ppl=6.15, wps=465218, ups=1.07, wpb=433355, bsz=16678.7, num_updates=32000, lr=0.000353553, gnorm=0.236, clip=0, loss_scale=2, train_wall=91, gb_free=19.1, wall=31164 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 epoch 019 | valid on 'valid' subset | loss 4.246 | nll_loss 2.603 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.246 end of epoch 19 (average epoch stats below) epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 epoch 019 | loss 4.221 | nll_loss 2.607 | ppl 6.09 | wps 448450 | ups 1.03 | wpb 433522 | bsz 16503.9 | num_updates 32039 | lr 0.000353338 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1556 | gb_free 20.4 | wall 31242 Start iterating over samples epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 62 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=316704, ups=0.74, wpb=429481, bsz=16198.4, num_updates=32100, lr=0.000353002, gnorm=0.255, clip=0, loss_scale=1, train_wall=95, gb_free=20.2, wall=31300 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 162 / 1689 loss=4.203, nll_loss=2.587, ppl=6.01, wps=471988, ups=1.09, wpb=433320, bsz=16206, num_updates=32200, lr=0.000352454, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31391 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 262 / 1689 loss=4.205, nll_loss=2.589, ppl=6.02, wps=469571, ups=1.08, wpb=433434, bsz=16355.5, num_updates=32300, lr=0.000351908, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=20.8, wall=31484 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 362 / 1689 loss=4.198, nll_loss=2.581, ppl=5.99, wps=463893, ups=1.07, wpb=433149, bsz=16657.8, num_updates=32400, lr=0.000351364, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=31577 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 462 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=464210, ups=1.07, wpb=433570, bsz=16796.1, num_updates=32500, lr=0.000350823, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=31671 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 562 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=467928, ups=1.07, wpb=435715, bsz=16777.7, num_updates=32600, lr=0.000350285, gnorm=0.248, clip=0, loss_scale=2, train_wall=92, gb_free=19.9, wall=31764 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 663 / 1689 loss=4.216, nll_loss=2.602, ppl=6.07, wps=458404, ups=1.05, wpb=434806, bsz=16406.2, num_updates=32700, lr=0.000349749, gnorm=0.263, clip=0, loss_scale=1, train_wall=94, gb_free=18, wall=31859 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 763 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=462561, ups=1.07, wpb=432927, bsz=16947.9, num_updates=32800, lr=0.000349215, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=21.5, wall=31952 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 863 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=460327, ups=1.06, wpb=433388, bsz=16471.8, num_updates=32900, lr=0.000348684, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=32046 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 epoch 020: 963 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=461274, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=32140 begin validation on "valid" subset epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.246 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1063 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=389234, ups=0.9, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.244, clip=0, loss_scale=1, train_wall=96, gb_free=18.6, wall=32251 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1163 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463260, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.254, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=32345 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1263 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461736, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=32439 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1363 / 1689 loss=4.219, nll_loss=2.606, ppl=6.09, wps=461891, ups=1.07, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=18.7, wall=32533 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1463 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=465542, ups=1.08, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.247, clip=0, loss_scale=2, train_wall=91, gb_free=19.5, wall=32626 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1563 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=461453, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.241, clip=0, loss_scale=2, train_wall=92, gb_free=19.8, wall=32720 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 epoch 020: 1664 / 1689 loss=4.224, nll_loss=2.612, ppl=6.11, wps=457742, ups=1.05, wpb=436402, bsz=16568.4, num_updates=33700, lr=0.00034452, gnorm=0.251, clip=0, loss_scale=2, train_wall=94, gb_free=19.7, wall=32815 end of epoch 20 (average epoch stats below) epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 epoch 020 | loss 4.213 | nll_loss 2.599 | ppl 6.06 | wps 457989 | ups 1.06 | wpb 433528 | bsz 16499.5 | num_updates 33725 | lr 0.000344393 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 1558 | gb_free 20 | wall 32838 Start iterating over samples epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 76 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=456616, ups=1.07, wpb=428677, bsz=16288.2, num_updates=33800, lr=0.00034401, gnorm=0.253, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=32909 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 176 / 1689 loss=4.195, nll_loss=2.578, ppl=5.97, wps=463501, ups=1.07, wpb=433163, bsz=16500.3, num_updates=33900, lr=0.000343503, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=33002 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 epoch 021: 276 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=463952, ups=1.07, wpb=432644, bsz=16608.2, num_updates=34000, lr=0.000342997, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33096 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.24 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.24 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 376 / 1689 loss=4.2, nll_loss=2.584, ppl=6, wps=384416, ups=0.89, wpb=432906, bsz=16776.3, num_updates=34100, lr=0.000342494, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33208 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 476 / 1689 loss=4.206, nll_loss=2.591, ppl=6.02, wps=459964, ups=1.07, wpb=431692, bsz=16654.4, num_updates=34200, lr=0.000341993, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=33302 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 576 / 1689 loss=4.216, nll_loss=2.603, ppl=6.07, wps=468323, ups=1.07, wpb=437780, bsz=16499.4, num_updates=34300, lr=0.000341494, gnorm=0.232, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=33396 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 677 / 1689 loss=4.205, nll_loss=2.59, ppl=6.02, wps=460726, ups=1.06, wpb=432781, bsz=16408.7, num_updates=34400, lr=0.000340997, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=33490 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 777 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=458996, ups=1.06, wpb=433224, bsz=16614.5, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33584 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 877 / 1689 loss=4.199, nll_loss=2.583, ppl=5.99, wps=461256, ups=1.07, wpb=432913, bsz=16305.4, num_updates=34600, lr=0.00034001, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=33678 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 977 / 1689 loss=4.207, nll_loss=2.592, ppl=6.03, wps=462373, ups=1.07, wpb=433286, bsz=16327.8, num_updates=34700, lr=0.00033952, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=33772 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1077 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=464303, ups=1.06, wpb=436146, bsz=16429.8, num_updates=34800, lr=0.000339032, gnorm=0.25, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=33865 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1177 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=463614, ups=1.07, wpb=433542, bsz=16793.7, num_updates=34900, lr=0.000338546, gnorm=0.243, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=33959 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 epoch 021: 1278 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=458811, ups=1.05, wpb=435755, bsz=16577.9, num_updates=35000, lr=0.000338062, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=34054 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021 | valid on 'valid' subset | loss 4.243 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.24 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1378 / 1689 loss=4.212, nll_loss=2.598, ppl=6.05, wps=389646, ups=0.9, wpb=433707, bsz=16114.3, num_updates=35100, lr=0.00033758, gnorm=0.239, clip=0, loss_scale=1, train_wall=97, gb_free=17.6, wall=34165 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1478 / 1689 loss=4.215, nll_loss=2.602, ppl=6.07, wps=463611, ups=1.07, wpb=433104, bsz=16385.6, num_updates=35200, lr=0.0003371, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=34259 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1578 / 1689 loss=4.203, nll_loss=2.588, ppl=6.01, wps=469752, ups=1.08, wpb=434080, bsz=16485.2, num_updates=35300, lr=0.000336622, gnorm=0.25, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=34351 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 epoch 021: 1678 / 1689 loss=4.206, nll_loss=2.592, ppl=6.03, wps=468803, ups=1.08, wpb=433998, bsz=16612.2, num_updates=35400, lr=0.000336146, gnorm=0.26, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=34444 end of epoch 21 (average epoch stats below) epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 epoch 021 | loss 4.205 | nll_loss 2.591 | ppl 6.02 | wps 452456 | ups 1.04 | wpb 433536 | bsz 16502.5 | num_updates 35411 | lr 0.000336094 | gnorm 0.245 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.2 | wall 34453 Start iterating over samples epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 89 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=461427, ups=1.07, wpb=430721, bsz=16343.4, num_updates=35500, lr=0.000335673, gnorm=0.246, clip=0, loss_scale=2, train_wall=91, gb_free=19.4, wall=34537 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 190 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=460294, ups=1.07, wpb=431505, bsz=16360.8, num_updates=35600, lr=0.000335201, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=34631 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 290 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=470808, ups=1.09, wpb=432984, bsz=16550.6, num_updates=35700, lr=0.000334731, gnorm=0.24, clip=0, loss_scale=1, train_wall=91, gb_free=19.6, wall=34723 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 390 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=465306, ups=1.07, wpb=434338, bsz=16600.6, num_updates=35800, lr=0.000334263, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=34816 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 490 / 1689 loss=4.197, nll_loss=2.581, ppl=5.98, wps=464230, ups=1.07, wpb=434091, bsz=16601, num_updates=35900, lr=0.000333797, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=34910 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 epoch 022: 590 / 1689 loss=4.194, nll_loss=2.577, ppl=5.97, wps=465851, ups=1.07, wpb=434301, bsz=16612.5, num_updates=36000, lr=0.000333333, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35003 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.252 | nll_loss 2.613 | ppl 6.12 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.24 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 690 / 1689 loss=4.191, nll_loss=2.574, ppl=5.96, wps=409472, ups=0.94, wpb=435142, bsz=16517.9, num_updates=36100, lr=0.000332871, gnorm=0.247, clip=0, loss_scale=2, train_wall=92, gb_free=19.1, wall=35109 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 791 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=463667, ups=1.07, wpb=433145, bsz=16175, num_updates=36200, lr=0.000332411, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35203 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 891 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=465076, ups=1.07, wpb=434634, bsz=16812.3, num_updates=36300, lr=0.000331953, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=21.3, wall=35296 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 991 / 1689 loss=4.204, nll_loss=2.589, ppl=6.02, wps=468029, ups=1.07, wpb=435591, bsz=16464.6, num_updates=36400, lr=0.000331497, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35389 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1091 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=465341, ups=1.07, wpb=435481, bsz=16571.8, num_updates=36500, lr=0.000331042, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35483 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1191 / 1689 loss=4.211, nll_loss=2.597, ppl=6.05, wps=466629, ups=1.08, wpb=433748, bsz=16406.6, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=35576 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1292 / 1689 loss=4.197, nll_loss=2.582, ppl=5.99, wps=457053, ups=1.06, wpb=431755, bsz=16547.1, num_updates=36700, lr=0.000330139, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=35670 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1392 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=465570, ups=1.07, wpb=434817, bsz=16764.6, num_updates=36800, lr=0.00032969, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=35763 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1492 / 1689 loss=4.201, nll_loss=2.586, ppl=6.01, wps=465112, ups=1.07, wpb=433608, bsz=16323.2, num_updates=36900, lr=0.000329243, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=35857 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 epoch 022: 1592 / 1689 loss=4.217, nll_loss=2.604, ppl=6.08, wps=464238, ups=1.07, wpb=433010, bsz=16530.7, num_updates=37000, lr=0.000328798, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=35950 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 epoch 022 | valid on 'valid' subset | loss 4.25 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.24 end of epoch 22 (average epoch stats below) epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 epoch 022 | loss 4.199 | nll_loss 2.583 | ppl 5.99 | wps 452480 | ups 1.04 | wpb 433538 | bsz 16502.9 | num_updates 37097 | lr 0.000328368 | gnorm 0.242 | clip 0 | loss_scale 1 | train_wall 1564 | gb_free 21 | wall 36069 Start iterating over samples epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 3 / 1689 loss=4.201, nll_loss=2.586, ppl=6, wps=350681, ups=0.82, wpb=428418, bsz=16373.2, num_updates=37100, lr=0.000328355, gnorm=0.239, clip=0, loss_scale=1, train_wall=104, gb_free=18.8, wall=36072 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 103 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=465595, ups=1.07, wpb=434698, bsz=16628.2, num_updates=37200, lr=0.000327913, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=36166 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 204 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=459953, ups=1.06, wpb=434157, bsz=17069.1, num_updates=37300, lr=0.000327473, gnorm=0.243, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=36260 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 304 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=462894, ups=1.07, wpb=434509, bsz=16751.4, num_updates=37400, lr=0.000327035, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=36354 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 404 / 1689 loss=4.196, nll_loss=2.58, ppl=5.98, wps=463172, ups=1.07, wpb=434188, bsz=16747, num_updates=37500, lr=0.000326599, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=36448 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 504 / 1689 loss=4.184, nll_loss=2.566, ppl=5.92, wps=459657, ups=1.06, wpb=434080, bsz=16716.4, num_updates=37600, lr=0.000326164, gnorm=0.254, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=36542 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 604 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=465612, ups=1.07, wpb=433922, bsz=16182.7, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36635 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 704 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=457353, ups=1.06, wpb=431646, bsz=16541.7, num_updates=37800, lr=0.0003253, gnorm=0.258, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=36730 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 805 / 1689 loss=4.186, nll_loss=2.569, ppl=5.93, wps=456185, ups=1.06, wpb=432144, bsz=16585.2, num_updates=37900, lr=0.000324871, gnorm=0.246, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=36824 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 epoch 023: 905 / 1689 loss=4.202, nll_loss=2.587, ppl=6.01, wps=465383, ups=1.07, wpb=434942, bsz=16746.6, num_updates=38000, lr=0.000324443, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=36918 begin validation on "valid" subset epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023 | valid on 'valid' subset | loss 4.243 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.24 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1005 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=388603, ups=0.9, wpb=431647, bsz=16161.5, num_updates=38100, lr=0.000324017, gnorm=0.251, clip=0, loss_scale=1, train_wall=96, gb_free=19.9, wall=37029 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1105 / 1689 loss=4.207, nll_loss=2.593, ppl=6.04, wps=463733, ups=1.07, wpb=435194, bsz=16270.2, num_updates=38200, lr=0.000323592, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=37123 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1205 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=465941, ups=1.07, wpb=433533, bsz=16194.6, num_updates=38300, lr=0.00032317, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37216 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1306 / 1689 loss=4.203, nll_loss=2.589, ppl=6.02, wps=461232, ups=1.06, wpb=435577, bsz=16496.6, num_updates=38400, lr=0.000322749, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=37310 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1406 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=460442, ups=1.06, wpb=433238, bsz=16557, num_updates=38500, lr=0.000322329, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=37404 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1506 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=462606, ups=1.07, wpb=433627, bsz=16515.6, num_updates=38600, lr=0.000321911, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=37498 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 epoch 023: 1606 / 1689 loss=4.193, nll_loss=2.577, ppl=5.97, wps=463954, ups=1.07, wpb=432896, bsz=16382.6, num_updates=38700, lr=0.000321495, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=37591 end of epoch 23 (average epoch stats below) epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 epoch 023 | loss 4.192 | nll_loss 2.576 | ppl 5.96 | wps 456994 | ups 1.05 | wpb 433529 | bsz 16508.6 | num_updates 38783 | lr 0.000321151 | gnorm 0.243 | clip 0 | loss_scale 1 | train_wall 1559 | gb_free 22.2 | wall 37668 Start iterating over samples epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 17 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=433673, ups=1.01, wpb=429857, bsz=16080.9, num_updates=38800, lr=0.000321081, gnorm=0.237, clip=0, loss_scale=1, train_wall=95, gb_free=18.3, wall=37690 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 118 / 1689 loss=4.179, nll_loss=2.56, ppl=5.9, wps=457650, ups=1.05, wpb=433926, bsz=16738.9, num_updates=38900, lr=0.000320668, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=37785 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 epoch 024: 218 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=463292, ups=1.07, wpb=431368, bsz=16801.8, num_updates=39000, lr=0.000320256, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=37878 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.24 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 318 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=402240, ups=0.93, wpb=433635, bsz=16143.8, num_updates=39100, lr=0.000319847, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=37986 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 418 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=466041, ups=1.07, wpb=435456, bsz=16547.7, num_updates=39200, lr=0.000319438, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=38080 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 518 / 1689 loss=4.182, nll_loss=2.564, ppl=5.91, wps=464692, ups=1.07, wpb=434146, bsz=16599.8, num_updates=39300, lr=0.000319032, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=38173 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 618 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468565, ups=1.08, wpb=434970, bsz=16418.2, num_updates=39400, lr=0.000318626, gnorm=0.245, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=38266 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 719 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=461559, ups=1.06, wpb=434675, bsz=16143.1, num_updates=39500, lr=0.000318223, gnorm=0.247, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=38360 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 819 / 1689 loss=4.189, nll_loss=2.572, ppl=5.95, wps=468296, ups=1.08, wpb=432654, bsz=16315.8, num_updates=39600, lr=0.000317821, gnorm=0.249, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=38452 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 919 / 1689 loss=4.195, nll_loss=2.58, ppl=5.98, wps=467212, ups=1.08, wpb=434412, bsz=16652.8, num_updates=39700, lr=0.00031742, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=38545 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1019 / 1689 loss=4.192, nll_loss=2.576, ppl=5.96, wps=469116, ups=1.08, wpb=433262, bsz=16511.3, num_updates=39800, lr=0.000317021, gnorm=0.256, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=38638 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1119 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=465870, ups=1.07, wpb=433960, bsz=17153.7, num_updates=39900, lr=0.000316624, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=38731 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 epoch 024: 1219 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=464444, ups=1.07, wpb=434627, bsz=16603, num_updates=40000, lr=0.000316228, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=38824 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024 | valid on 'valid' subset | loss 4.246 | nll_loss 2.605 | ppl 6.08 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.24 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1320 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=335864, ups=0.78, wpb=431724, bsz=16480, num_updates=40100, lr=0.000315833, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=38953 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1420 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=469803, ups=1.08, wpb=433868, bsz=16204.6, num_updates=40200, lr=0.00031544, gnorm=0.232, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=39045 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1520 / 1689 loss=4.196, nll_loss=2.581, ppl=5.98, wps=469908, ups=1.08, wpb=433180, bsz=16403, num_updates=40300, lr=0.000315049, gnorm=0.232, clip=0, loss_scale=1, train_wall=91, gb_free=18.7, wall=39138 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 epoch 024: 1620 / 1689 loss=4.199, nll_loss=2.584, ppl=6, wps=468532, ups=1.08, wpb=433684, bsz=16583, num_updates=40400, lr=0.000314658, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=39230 end of epoch 24 (average epoch stats below) epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 epoch 024 | loss 4.187 | nll_loss 2.57 | ppl 5.94 | wps 449593 | ups 1.04 | wpb 433527 | bsz 16504.1 | num_updates 40469 | lr 0.00031439 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1554 | gb_free 19.8 | wall 39294 Start iterating over samples epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 31 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=462908, ups=1.07, wpb=431836, bsz=16222.1, num_updates=40500, lr=0.00031427, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=39323 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 131 / 1689 loss=4.162, nll_loss=2.541, ppl=5.82, wps=462147, ups=1.07, wpb=431864, bsz=16554.2, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=39417 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 232 / 1689 loss=4.171, nll_loss=2.552, ppl=5.86, wps=460677, ups=1.06, wpb=435296, bsz=16475, num_updates=40700, lr=0.000313497, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39511 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 332 / 1689 loss=4.18, nll_loss=2.562, ppl=5.9, wps=462964, ups=1.06, wpb=435509, bsz=16619.8, num_updates=40800, lr=0.000313112, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39605 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 432 / 1689 loss=4.177, nll_loss=2.558, ppl=5.89, wps=459780, ups=1.07, wpb=431051, bsz=16641.2, num_updates=40900, lr=0.000312729, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=39699 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 epoch 025: 532 / 1689 loss=4.174, nll_loss=2.556, ppl=5.88, wps=459068, ups=1.06, wpb=432480, bsz=16643, num_updates=41000, lr=0.000312348, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=39793 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.251 | nll_loss 2.611 | ppl 6.11 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.24 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 632 / 1689 loss=4.187, nll_loss=2.571, ppl=5.94, wps=381471, ups=0.88, wpb=434284, bsz=16285.8, num_updates=41100, lr=0.000311967, gnorm=0.25, clip=0, loss_scale=1, train_wall=99, gb_free=20.4, wall=39907 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 733 / 1689 loss=4.18, nll_loss=2.562, ppl=5.91, wps=458937, ups=1.06, wpb=433808, bsz=16692, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40002 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 833 / 1689 loss=4.179, nll_loss=2.561, ppl=5.9, wps=462091, ups=1.07, wpb=433014, bsz=16475.2, num_updates=41300, lr=0.000311211, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=40095 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 933 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=462640, ups=1.07, wpb=433626, bsz=16809.8, num_updates=41400, lr=0.000310835, gnorm=0.261, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=40189 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1033 / 1689 loss=4.191, nll_loss=2.575, ppl=5.96, wps=461628, ups=1.06, wpb=434563, bsz=16307, num_updates=41500, lr=0.00031046, gnorm=0.236, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=40283 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1133 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=463738, ups=1.07, wpb=433022, bsz=16158.2, num_updates=41600, lr=0.000310087, gnorm=0.252, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=40377 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1233 / 1689 loss=4.183, nll_loss=2.567, ppl=5.92, wps=467086, ups=1.07, wpb=434870, bsz=16393.3, num_updates=41700, lr=0.000309715, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=40470 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1334 / 1689 loss=4.19, nll_loss=2.574, ppl=5.95, wps=463208, ups=1.06, wpb=435896, bsz=16693.9, num_updates=41800, lr=0.000309344, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=21.1, wall=40564 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1434 / 1689 loss=4.193, nll_loss=2.578, ppl=5.97, wps=465405, ups=1.07, wpb=434410, bsz=16421.7, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=40657 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 epoch 025: 1534 / 1689 loss=4.19, nll_loss=2.575, ppl=5.96, wps=461933, ups=1.06, wpb=433897, bsz=16680.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=40751 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025 | valid on 'valid' subset | loss 4.242 | nll_loss 2.601 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.24 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 epoch 025: 1634 / 1689 loss=4.18, nll_loss=2.563, ppl=5.91, wps=372730, ups=0.86, wpb=431638, bsz=16418.5, num_updates=42100, lr=0.00030824, gnorm=0.225, clip=0, loss_scale=1, train_wall=101, gb_free=19.2, wall=40867 end of epoch 25 (average epoch stats below) epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 epoch 025 | loss 4.181 | nll_loss 2.564 | ppl 5.91 | wps 450171 | ups 1.04 | wpb 433559 | bsz 16505.1 | num_updates 42155 | lr 0.000308039 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 19.3 | wall 40918 Start iterating over samples epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 45 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=454134, ups=1.05, wpb=431764, bsz=16529.6, num_updates=42200, lr=0.000307875, gnorm=0.242, clip=0, loss_scale=1, train_wall=91, gb_free=19.8, wall=40962 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 146 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=462924, ups=1.06, wpb=434783, bsz=16348.6, num_updates=42300, lr=0.00030751, gnorm=0.241, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41056 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 246 / 1689 loss=4.173, nll_loss=2.554, ppl=5.87, wps=464346, ups=1.07, wpb=434195, bsz=16906.5, num_updates=42400, lr=0.000307148, gnorm=0.237, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=41150 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 346 / 1689 loss=4.176, nll_loss=2.558, ppl=5.89, wps=467921, ups=1.07, wpb=435400, bsz=16584.5, num_updates=42500, lr=0.000306786, gnorm=0.247, clip=0, loss_scale=1, train_wall=91, gb_free=19.3, wall=41243 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 446 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=462965, ups=1.07, wpb=433402, bsz=16215.5, num_updates=42600, lr=0.000306426, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=41336 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 546 / 1689 loss=4.172, nll_loss=2.553, ppl=5.87, wps=457352, ups=1.06, wpb=431700, bsz=16695.3, num_updates=42700, lr=0.000306067, gnorm=0.255, clip=0, loss_scale=1, train_wall=92, gb_free=20.5, wall=41431 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 647 / 1689 loss=4.165, nll_loss=2.545, ppl=5.84, wps=455848, ups=1.05, wpb=433804, bsz=16283.2, num_updates=42800, lr=0.000305709, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=41526 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 747 / 1689 loss=4.184, nll_loss=2.567, ppl=5.92, wps=459163, ups=1.05, wpb=435421, bsz=16533.8, num_updates=42900, lr=0.000305352, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=41621 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 epoch 026: 847 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=464340, ups=1.07, wpb=432578, bsz=16502.1, num_updates=43000, lr=0.000304997, gnorm=0.249, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=41714 begin validation on "valid" subset epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026 | valid on 'valid' subset | loss 4.241 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.24 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 947 / 1689 loss=4.192, nll_loss=2.577, ppl=5.97, wps=386248, ups=0.89, wpb=435839, bsz=16457.6, num_updates=43100, lr=0.000304643, gnorm=0.239, clip=0, loss_scale=1, train_wall=95, gb_free=18.9, wall=41827 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1047 / 1689 loss=4.167, nll_loss=2.548, ppl=5.85, wps=462330, ups=1.07, wpb=430785, bsz=16349.7, num_updates=43200, lr=0.00030429, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=41920 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1147 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=463594, ups=1.07, wpb=432708, bsz=16978.7, num_updates=43300, lr=0.000303939, gnorm=0.247, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42013 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1249 / 1689 loss=4.178, nll_loss=2.56, ppl=5.9, wps=453894, ups=1.05, wpb=433334, bsz=16174.4, num_updates=43400, lr=0.000303588, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=42109 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1349 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=464186, ups=1.07, wpb=432212, bsz=16425, num_updates=43500, lr=0.000303239, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=42202 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1449 / 1689 loss=4.181, nll_loss=2.565, ppl=5.92, wps=462542, ups=1.06, wpb=435496, bsz=16790.6, num_updates=43600, lr=0.000302891, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=42296 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1549 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=464531, ups=1.07, wpb=432930, bsz=16359.8, num_updates=43700, lr=0.000302545, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=42389 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 epoch 026: 1649 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=459880, ups=1.06, wpb=433108, bsz=16416.3, num_updates=43800, lr=0.000302199, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=42483 end of epoch 26 (average epoch stats below) epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 epoch 026 | loss 4.176 | nll_loss 2.558 | ppl 5.89 | wps 455723 | ups 1.05 | wpb 433527 | bsz 16501.1 | num_updates 43840 | lr 0.000302061 | gnorm 0.24 | clip 0 | loss_scale 1 | train_wall 1558 | gb_free 19.2 | wall 42521 Start iterating over samples epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 61 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=453356, ups=1.05, wpb=430464, bsz=16069.1, num_updates=43900, lr=0.000301855, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=42578 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 epoch 027: 161 / 1689 loss=4.166, nll_loss=2.546, ppl=5.84, wps=464330, ups=1.07, wpb=434045, bsz=16643.2, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=42672 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.602 | ppl 6.07 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.24 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 261 / 1689 loss=4.156, nll_loss=2.535, ppl=5.79, wps=410525, ups=0.95, wpb=434001, bsz=16130.3, num_updates=44100, lr=0.000301169, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=42777 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 361 / 1689 loss=4.162, nll_loss=2.542, ppl=5.82, wps=467954, ups=1.08, wpb=434786, bsz=16419.8, num_updates=44200, lr=0.000300828, gnorm=0.246, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42870 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 461 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=467673, ups=1.08, wpb=431744, bsz=16399.2, num_updates=44300, lr=0.000300489, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=42963 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 561 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=466944, ups=1.08, wpb=433536, bsz=16416.7, num_updates=44400, lr=0.00030015, gnorm=0.227, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43055 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 661 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=468247, ups=1.08, wpb=434869, bsz=16365, num_updates=44500, lr=0.000299813, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=43148 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 761 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=466614, ups=1.07, wpb=434750, bsz=16662, num_updates=44600, lr=0.000299476, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43242 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 861 / 1689 loss=4.184, nll_loss=2.568, ppl=5.93, wps=468956, ups=1.08, wpb=435706, bsz=16737, num_updates=44700, lr=0.000299141, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=20.3, wall=43334 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 961 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=461916, ups=1.07, wpb=431604, bsz=16370.3, num_updates=44800, lr=0.000298807, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=43428 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1061 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=463276, ups=1.07, wpb=433938, bsz=16812.7, num_updates=44900, lr=0.000298474, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.2, wall=43522 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 epoch 027: 1162 / 1689 loss=4.182, nll_loss=2.566, ppl=5.92, wps=460136, ups=1.06, wpb=434304, bsz=16902.6, num_updates=45000, lr=0.000298142, gnorm=0.252, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=43616 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027 | valid on 'valid' subset | loss 4.237 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.237 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1262 / 1689 loss=4.172, nll_loss=2.554, ppl=5.87, wps=367289, ups=0.84, wpb=436279, bsz=16465.8, num_updates=45100, lr=0.000297812, gnorm=0.227, clip=0, loss_scale=1, train_wall=97, gb_free=18.6, wall=43735 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1362 / 1689 loss=4.17, nll_loss=2.551, ppl=5.86, wps=464253, ups=1.07, wpb=432685, bsz=16449, num_updates=45200, lr=0.000297482, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=20.1, wall=43828 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1462 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=464355, ups=1.07, wpb=432167, bsz=16491, num_updates=45300, lr=0.000297154, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=43921 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1562 / 1689 loss=4.176, nll_loss=2.559, ppl=5.89, wps=463083, ups=1.07, wpb=432238, bsz=16438.8, num_updates=45400, lr=0.000296826, gnorm=0.241, clip=0, loss_scale=1, train_wall=91, gb_free=19.7, wall=44014 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 epoch 027: 1663 / 1689 loss=4.188, nll_loss=2.572, ppl=5.95, wps=459743, ups=1.06, wpb=433560, bsz=16753.1, num_updates=45500, lr=0.0002965, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=44109 end of epoch 27 (average epoch stats below) epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 epoch 027 | loss 4.171 | nll_loss 2.552 | ppl 5.87 | wps 453551 | ups 1.05 | wpb 433556 | bsz 16507.1 | num_updates 45526 | lr 0.000296415 | gnorm 0.241 | clip 0 | loss_scale 1 | train_wall 1556 | gb_free 20.6 | wall 44132 Start iterating over samples epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 74 / 1689 loss=4.162, nll_loss=2.543, ppl=5.83, wps=460974, ups=1.07, wpb=430923, bsz=16379, num_updates=45600, lr=0.000296174, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=17.9, wall=44202 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 174 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=464994, ups=1.07, wpb=433524, bsz=16631.8, num_updates=45700, lr=0.00029585, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=44295 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 274 / 1689 loss=4.163, nll_loss=2.544, ppl=5.83, wps=464358, ups=1.07, wpb=433701, bsz=16375.4, num_updates=45800, lr=0.000295527, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=44389 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 374 / 1689 loss=4.168, nll_loss=2.549, ppl=5.85, wps=465074, ups=1.07, wpb=433448, bsz=16623.4, num_updates=45900, lr=0.000295205, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=44482 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 epoch 028: 474 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=461359, ups=1.06, wpb=433392, bsz=16715.1, num_updates=46000, lr=0.000294884, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=18.4, wall=44576 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028 | valid on 'valid' subset | loss 4.252 | nll_loss 2.609 | ppl 6.1 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.237 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 575 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=406555, ups=0.94, wpb=433449, bsz=16920.2, num_updates=46100, lr=0.000294564, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=44683 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 675 / 1689 loss=4.161, nll_loss=2.541, ppl=5.82, wps=467969, ups=1.08, wpb=434993, bsz=16463.1, num_updates=46200, lr=0.000294245, gnorm=0.236, clip=0, loss_scale=1, train_wall=91, gb_free=18.6, wall=44776 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 775 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=464486, ups=1.07, wpb=432410, bsz=16449.1, num_updates=46300, lr=0.000293927, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=44869 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 875 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=460047, ups=1.07, wpb=431558, bsz=16374.8, num_updates=46400, lr=0.00029361, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=44962 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 975 / 1689 loss=4.178, nll_loss=2.561, ppl=5.9, wps=463847, ups=1.07, wpb=434585, bsz=16437.5, num_updates=46500, lr=0.000293294, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=45056 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1075 / 1689 loss=4.177, nll_loss=2.559, ppl=5.9, wps=465190, ups=1.07, wpb=433164, bsz=16147, num_updates=46600, lr=0.000292979, gnorm=0.236, clip=0, loss_scale=2, train_wall=92, gb_free=18.6, wall=45149 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1176 / 1689 loss=4.159, nll_loss=2.54, ppl=5.81, wps=459357, ups=1.06, wpb=433742, bsz=16365.4, num_updates=46700, lr=0.000292666, gnorm=0.24, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=45244 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1276 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=465307, ups=1.07, wpb=434589, bsz=16371, num_updates=46800, lr=0.000292353, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=45337 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1376 / 1689 loss=4.183, nll_loss=2.566, ppl=5.92, wps=464459, ups=1.07, wpb=433706, bsz=16626.6, num_updates=46900, lr=0.000292041, gnorm=0.253, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=45430 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 epoch 028: 1477 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=458013, ups=1.06, wpb=431547, bsz=16616.6, num_updates=47000, lr=0.00029173, gnorm=0.26, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=45525 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028 | valid on 'valid' subset | loss 4.233 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.233 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1577 / 1689 loss=4.175, nll_loss=2.557, ppl=5.89, wps=312762, ups=0.72, wpb=434371, bsz=16455, num_updates=47100, lr=0.00029142, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=97, gb_free=19.2, wall=45664 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 epoch 028: 1677 / 1689 loss=4.174, nll_loss=2.557, ppl=5.89, wps=472657, ups=1.08, wpb=436763, bsz=16788.9, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=45756 end of epoch 28 (average epoch stats below) epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 epoch 028 | loss 4.166 | nll_loss 2.547 | ppl 5.85 | wps 447324 | ups 1.03 | wpb 433533 | bsz 16505.4 | num_updates 47212 | lr 0.000291074 | gnorm 0.241 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 21.1 | wall 45766 Start iterating over samples epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 88 / 1689 loss=4.153, nll_loss=2.532, ppl=5.78, wps=455776, ups=1.06, wpb=429726, bsz=16389.6, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=45850 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 188 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=469013, ups=1.08, wpb=434289, bsz=16801.1, num_updates=47400, lr=0.000290496, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=45943 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 288 / 1689 loss=4.152, nll_loss=2.53, ppl=5.78, wps=466955, ups=1.08, wpb=433474, bsz=16316, num_updates=47500, lr=0.000290191, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=20.6, wall=46036 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 388 / 1689 loss=4.156, nll_loss=2.535, ppl=5.8, wps=468285, ups=1.08, wpb=431600, bsz=16181.5, num_updates=47600, lr=0.000289886, gnorm=0.238, clip=0, loss_scale=1, train_wall=91, gb_free=19.1, wall=46128 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 488 / 1689 loss=4.152, nll_loss=2.531, ppl=5.78, wps=464345, ups=1.07, wpb=433583, bsz=16726.3, num_updates=47700, lr=0.000289581, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=21.4, wall=46221 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 589 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=460176, ups=1.06, wpb=433106, bsz=16806.2, num_updates=47800, lr=0.000289278, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=46315 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 689 / 1689 loss=4.174, nll_loss=2.557, ppl=5.88, wps=467936, ups=1.08, wpb=433018, bsz=16513.7, num_updates=47900, lr=0.000288976, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.2, wall=46408 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 epoch 029: 789 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=462065, ups=1.07, wpb=433235, bsz=16418.5, num_updates=48000, lr=0.000288675, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=46502 begin validation on "valid" subset epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029 | valid on 'valid' subset | loss 4.231 | nll_loss 2.59 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.231 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 889 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=339659, ups=0.78, wpb=433969, bsz=16586.2, num_updates=48100, lr=0.000288375, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=46629 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 989 / 1689 loss=4.164, nll_loss=2.545, ppl=5.83, wps=465510, ups=1.08, wpb=431055, bsz=16584.7, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=46722 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1089 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=466626, ups=1.07, wpb=434665, bsz=16459.8, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=46815 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1189 / 1689 loss=4.17, nll_loss=2.552, ppl=5.87, wps=466300, ups=1.07, wpb=434409, bsz=16761.1, num_updates=48400, lr=0.00028748, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=46908 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1290 / 1689 loss=4.17, nll_loss=2.553, ppl=5.87, wps=462551, ups=1.06, wpb=435129, bsz=16637.2, num_updates=48500, lr=0.000287183, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=47002 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1390 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=467176, ups=1.08, wpb=433873, bsz=16401.1, num_updates=48600, lr=0.000286888, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=47095 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1490 / 1689 loss=4.168, nll_loss=2.55, ppl=5.86, wps=469092, ups=1.08, wpb=435094, bsz=16390.1, num_updates=48700, lr=0.000286593, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=19.5, wall=47188 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 epoch 029: 1590 / 1689 loss=4.161, nll_loss=2.542, ppl=5.82, wps=462108, ups=1.06, wpb=434213, bsz=16369.6, num_updates=48800, lr=0.000286299, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=47282 end of epoch 29 (average epoch stats below) epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 epoch 029 | loss 4.162 | nll_loss 2.542 | ppl 5.83 | wps 455089 | ups 1.05 | wpb 433534 | bsz 16508.1 | num_updates 48899 | lr 0.000286009 | gnorm 0.237 | clip 0 | loss_scale 0.5 | train_wall 1555 | gb_free 22.6 | wall 47373 Start iterating over samples epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 1 / 1689 loss=4.165, nll_loss=2.547, ppl=5.84, wps=462110, ups=1.07, wpb=432588, bsz=16100.1, num_updates=48900, lr=0.000286006, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47376 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 epoch 030: 101 / 1689 loss=4.15, nll_loss=2.529, ppl=5.77, wps=463169, ups=1.07, wpb=433829, bsz=16662.7, num_updates=49000, lr=0.000285714, gnorm=0.248, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=47469 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.242 | nll_loss 2.598 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.231 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 201 / 1689 loss=4.144, nll_loss=2.522, ppl=5.74, wps=321217, ups=0.74, wpb=433693, bsz=16225.8, num_updates=49100, lr=0.000285423, gnorm=0.237, clip=0, loss_scale=1, train_wall=118, gb_free=19.5, wall=47604 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 301 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=468044, ups=1.08, wpb=433852, bsz=16191.2, num_updates=49200, lr=0.000285133, gnorm=0.228, clip=0, loss_scale=1, train_wall=91, gb_free=18.8, wall=47697 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 401 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463316, ups=1.07, wpb=433807, bsz=16481.8, num_updates=49300, lr=0.000284844, gnorm=0.238, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=47791 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 502 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=460736, ups=1.07, wpb=432194, bsz=16742, num_updates=49400, lr=0.000284555, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=47884 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 602 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=465470, ups=1.08, wpb=432698, bsz=16541.8, num_updates=49500, lr=0.000284268, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.2, wall=47977 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 702 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=466637, ups=1.08, wpb=433728, bsz=16353.3, num_updates=49600, lr=0.000283981, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=48070 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 802 / 1689 loss=4.156, nll_loss=2.536, ppl=5.8, wps=464009, ups=1.07, wpb=434906, bsz=16375.1, num_updates=49700, lr=0.000283695, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=48164 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 902 / 1689 loss=4.171, nll_loss=2.553, ppl=5.87, wps=464722, ups=1.07, wpb=435563, bsz=16450.2, num_updates=49800, lr=0.00028341, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.9, wall=48258 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1002 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=463824, ups=1.07, wpb=432512, bsz=16482.8, num_updates=49900, lr=0.000283126, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48351 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 epoch 030: 1102 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=461919, ups=1.07, wpb=431856, bsz=16718, num_updates=50000, lr=0.000282843, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=48445 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030 | valid on 'valid' subset | loss 4.236 | nll_loss 2.597 | ppl 6.05 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.231 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1202 / 1689 loss=4.154, nll_loss=2.535, ppl=5.79, wps=393138, ups=0.91, wpb=432029, bsz=16944.6, num_updates=50100, lr=0.00028256, gnorm=0.247, clip=0, loss_scale=1, train_wall=94, gb_free=18.6, wall=48554 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1302 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=463036, ups=1.06, wpb=435764, bsz=16232.2, num_updates=50200, lr=0.000282279, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=48649 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1402 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=460270, ups=1.06, wpb=433370, bsz=16718.3, num_updates=50300, lr=0.000281998, gnorm=0.248, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=48743 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1503 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=456859, ups=1.06, wpb=432936, bsz=16615.5, num_updates=50400, lr=0.000281718, gnorm=0.239, clip=0, loss_scale=1, train_wall=93, gb_free=16.8, wall=48837 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 epoch 030: 1603 / 1689 loss=4.171, nll_loss=2.554, ppl=5.87, wps=464448, ups=1.06, wpb=437201, bsz=16526.5, num_updates=50500, lr=0.000281439, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=48932 end of epoch 30 (average epoch stats below) epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 epoch 030 | loss 4.157 | nll_loss 2.538 | ppl 5.81 | wps 446464 | ups 1.03 | wpb 433548 | bsz 16505 | num_updates 50586 | lr 0.0002812 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1581 | gb_free 20.7 | wall 49012 Start iterating over samples epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 14 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=455248, ups=1.06, wpb=430341, bsz=16405, num_updates=50600, lr=0.000281161, gnorm=0.234, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=49026 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 115 / 1689 loss=4.134, nll_loss=2.51, ppl=5.7, wps=456356, ups=1.06, wpb=431580, bsz=16391.9, num_updates=50700, lr=0.000280883, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49121 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 215 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=461043, ups=1.06, wpb=434153, bsz=16724.5, num_updates=50800, lr=0.000280607, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=49215 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 315 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458576, ups=1.06, wpb=434430, bsz=16288.6, num_updates=50900, lr=0.000280331, gnorm=0.248, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=49310 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 epoch 031: 415 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=459996, ups=1.06, wpb=432574, bsz=16403.8, num_updates=51000, lr=0.000280056, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49404 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.238 | nll_loss 2.599 | ppl 6.06 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.231 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 515 / 1689 loss=4.145, nll_loss=2.523, ppl=5.75, wps=412200, ups=0.95, wpb=435531, bsz=16664.4, num_updates=51100, lr=0.000279782, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=49509 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 615 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464770, ups=1.07, wpb=433512, bsz=16552, num_updates=51200, lr=0.000279508, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=49603 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 715 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=463349, ups=1.06, wpb=435216, bsz=16646.8, num_updates=51300, lr=0.000279236, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=49697 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 816 / 1689 loss=4.165, nll_loss=2.546, ppl=5.84, wps=462998, ups=1.06, wpb=435645, bsz=16173.2, num_updates=51400, lr=0.000278964, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=49791 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 916 / 1689 loss=4.164, nll_loss=2.545, ppl=5.84, wps=462189, ups=1.06, wpb=435418, bsz=16477.2, num_updates=51500, lr=0.000278693, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49885 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1016 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=460839, ups=1.07, wpb=430554, bsz=16676.2, num_updates=51600, lr=0.000278423, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=49978 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1116 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=464278, ups=1.07, wpb=433033, bsz=16059.2, num_updates=51700, lr=0.000278154, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=50072 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1216 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=464213, ups=1.07, wpb=433682, bsz=16635.6, num_updates=51800, lr=0.000277885, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=50165 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1316 / 1689 loss=4.157, nll_loss=2.537, ppl=5.81, wps=465889, ups=1.08, wpb=432901, bsz=16334.2, num_updates=51900, lr=0.000277617, gnorm=0.229, clip=0, loss_scale=1, train_wall=91, gb_free=19.4, wall=50258 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 epoch 031: 1417 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=455714, ups=1.05, wpb=434688, bsz=16921.4, num_updates=52000, lr=0.00027735, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=50353 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031 | valid on 'valid' subset | loss 4.236 | nll_loss 2.594 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.231 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1517 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=405759, ups=0.94, wpb=433234, bsz=16923, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.4, wall=50460 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 epoch 031: 1617 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=469443, ups=1.08, wpb=434800, bsz=16193.7, num_updates=52200, lr=0.000276818, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=50553 end of epoch 31 (average epoch stats below) epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 epoch 031 | loss 4.153 | nll_loss 2.533 | ppl 5.79 | wps 454629 | ups 1.05 | wpb 433523 | bsz 16504.5 | num_updates 52272 | lr 0.000276628 | gnorm 0.239 | clip 0 | loss_scale 0.5 | train_wall 1558 | gb_free 20.3 | wall 50619 Start iterating over samples epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 28 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=458368, ups=1.07, wpb=428923, bsz=16640.4, num_updates=52300, lr=0.000276553, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.3, wall=50646 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 128 / 1689 loss=4.143, nll_loss=2.521, ppl=5.74, wps=465995, ups=1.08, wpb=432231, bsz=16753.1, num_updates=52400, lr=0.000276289, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50739 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 228 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=463881, ups=1.07, wpb=434104, bsz=16518.6, num_updates=52500, lr=0.000276026, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=50833 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 328 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=462923, ups=1.07, wpb=432139, bsz=16496.5, num_updates=52600, lr=0.000275764, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=50926 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 428 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=463952, ups=1.07, wpb=435030, bsz=16597.3, num_updates=52700, lr=0.000275502, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=17.4, wall=51020 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 529 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=458784, ups=1.06, wpb=434489, bsz=16736.5, num_updates=52800, lr=0.000275241, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51114 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 629 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=460052, ups=1.06, wpb=432570, bsz=16361.3, num_updates=52900, lr=0.000274981, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=51208 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 epoch 032: 729 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=460602, ups=1.06, wpb=433221, bsz=16627, num_updates=53000, lr=0.000274721, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=51303 begin validation on "valid" subset epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032 | valid on 'valid' subset | loss 4.23 | nll_loss 2.589 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.23 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 829 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=298996, ups=0.69, wpb=435578, bsz=16043.7, num_updates=53100, lr=0.000274462, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=51448 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 929 / 1689 loss=4.155, nll_loss=2.535, ppl=5.79, wps=470982, ups=1.09, wpb=433718, bsz=16555.3, num_updates=53200, lr=0.000274204, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=51540 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1029 / 1689 loss=4.147, nll_loss=2.526, ppl=5.76, wps=467291, ups=1.08, wpb=432931, bsz=16103.6, num_updates=53300, lr=0.000273947, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51633 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1129 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466500, ups=1.07, wpb=435319, bsz=16492.7, num_updates=53400, lr=0.00027369, gnorm=0.241, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=51726 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1229 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=466616, ups=1.08, wpb=433483, bsz=16154.2, num_updates=53500, lr=0.000273434, gnorm=0.243, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=51819 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1329 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=464214, ups=1.07, wpb=432753, bsz=16307.1, num_updates=53600, lr=0.000273179, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=51912 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1429 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=463781, ups=1.07, wpb=433225, bsz=16777.9, num_updates=53700, lr=0.000272925, gnorm=0.242, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=52006 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1530 / 1689 loss=4.154, nll_loss=2.535, ppl=5.8, wps=458775, ups=1.05, wpb=435666, bsz=16671.8, num_updates=53800, lr=0.000272671, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=52101 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 epoch 032: 1631 / 1689 loss=4.143, nll_loss=2.522, ppl=5.75, wps=455602, ups=1.05, wpb=433116, bsz=16646.4, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=52196 end of epoch 32 (average epoch stats below) epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 epoch 032 | loss 4.149 | nll_loss 2.529 | ppl 5.77 | wps 448447 | ups 1.03 | wpb 433521 | bsz 16505.1 | num_updates 53958 | lr 0.000272271 | gnorm 0.236 | clip 0 | loss_scale 0.5 | train_wall 1556 | gb_free 22.2 | wall 52249 Start iterating over samples epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 epoch 033: 42 / 1689 loss=4.16, nll_loss=2.541, ppl=5.82, wps=462980, ups=1.07, wpb=431221, bsz=16311.7, num_updates=54000, lr=0.000272166, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.9, wall=52289 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.235 | nll_loss 2.591 | ppl 6.02 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.23 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 142 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=330423, ups=0.77, wpb=431725, bsz=16573.3, num_updates=54100, lr=0.000271914, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=112, gb_free=18.7, wall=52420 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 242 / 1689 loss=4.145, nll_loss=2.524, ppl=5.75, wps=461251, ups=1.06, wpb=433423, bsz=16749.4, num_updates=54200, lr=0.000271663, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=52514 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 342 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=466516, ups=1.08, wpb=433852, bsz=16275.6, num_updates=54300, lr=0.000271413, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=91, gb_free=18.6, wall=52607 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 442 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=461513, ups=1.07, wpb=432772, bsz=16422.2, num_updates=54400, lr=0.000271163, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=52700 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 543 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=457797, ups=1.05, wpb=434654, bsz=16467.3, num_updates=54500, lr=0.000270914, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=52795 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 643 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=461237, ups=1.06, wpb=433898, bsz=16789.8, num_updates=54600, lr=0.000270666, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.1, wall=52889 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 743 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=463362, ups=1.07, wpb=432549, bsz=16454.2, num_updates=54700, lr=0.000270418, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=52983 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 843 / 1689 loss=4.146, nll_loss=2.525, ppl=5.76, wps=462117, ups=1.06, wpb=434232, bsz=16711.5, num_updates=54800, lr=0.000270172, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=53077 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 943 / 1689 loss=4.141, nll_loss=2.519, ppl=5.73, wps=463275, ups=1.07, wpb=433943, bsz=15984, num_updates=54900, lr=0.000269925, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=53170 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 epoch 033: 1044 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=453806, ups=1.05, wpb=432209, bsz=16585.7, num_updates=55000, lr=0.00026968, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=53266 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033 | valid on 'valid' subset | loss 4.243 | nll_loss 2.606 | ppl 6.09 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.23 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1144 / 1689 loss=4.152, nll_loss=2.532, ppl=5.78, wps=350117, ups=0.81, wpb=434869, bsz=16450, num_updates=55100, lr=0.000269435, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=108, gb_free=21, wall=53390 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1244 / 1689 loss=4.152, nll_loss=2.533, ppl=5.79, wps=462254, ups=1.06, wpb=434258, bsz=17177.8, num_updates=55200, lr=0.000269191, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=53484 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1344 / 1689 loss=4.158, nll_loss=2.539, ppl=5.81, wps=463006, ups=1.07, wpb=432197, bsz=16397.4, num_updates=55300, lr=0.000268947, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=53577 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1444 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=464230, ups=1.07, wpb=433153, bsz=16790, num_updates=55400, lr=0.000268705, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=92, gb_free=21.2, wall=53670 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1544 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=464554, ups=1.07, wpb=435192, bsz=16702.6, num_updates=55500, lr=0.000268462, gnorm=0.233, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=53764 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 epoch 033: 1645 / 1689 loss=4.156, nll_loss=2.537, ppl=5.8, wps=459320, ups=1.06, wpb=435312, bsz=16042.3, num_updates=55600, lr=0.000268221, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=53859 end of epoch 33 (average epoch stats below) epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 epoch 033 | loss 4.146 | nll_loss 2.525 | ppl 5.76 | wps 442946 | ups 1.02 | wpb 433535 | bsz 16504.2 | num_updates 55644 | lr 0.000268115 | gnorm 0.235 | clip 0 | loss_scale 0.5 | train_wall 1592 | gb_free 21.6 | wall 53899 Start iterating over samples epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 56 / 1689 loss=4.139, nll_loss=2.517, ppl=5.72, wps=460699, ups=1.07, wpb=431584, bsz=16401.4, num_updates=55700, lr=0.00026798, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=53953 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 156 / 1689 loss=4.135, nll_loss=2.513, ppl=5.71, wps=460485, ups=1.06, wpb=434157, bsz=16696.7, num_updates=55800, lr=0.00026774, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=54047 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 256 / 1689 loss=4.128, nll_loss=2.505, ppl=5.68, wps=459690, ups=1.06, wpb=431895, bsz=16206.5, num_updates=55900, lr=0.0002675, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54141 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 epoch 034: 356 / 1689 loss=4.135, nll_loss=2.512, ppl=5.7, wps=459227, ups=1.06, wpb=433038, bsz=16340.5, num_updates=56000, lr=0.000267261, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=54235 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034 | valid on 'valid' subset | loss 4.228 | nll_loss 2.587 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.228 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 456 / 1689 loss=4.139, nll_loss=2.517, ppl=5.73, wps=386378, ups=0.89, wpb=434414, bsz=16580.2, num_updates=56100, lr=0.000267023, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=20.2, wall=54348 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 557 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=458880, ups=1.06, wpb=433459, bsz=16568, num_updates=56200, lr=0.000266785, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=54442 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 657 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=461358, ups=1.07, wpb=432221, bsz=16649.5, num_updates=56300, lr=0.000266548, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=54536 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 757 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=465631, ups=1.07, wpb=433703, bsz=16616.1, num_updates=56400, lr=0.000266312, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=54629 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 857 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=473122, ups=1.09, wpb=434432, bsz=16449.4, num_updates=56500, lr=0.000266076, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=91, gb_free=20.3, wall=54721 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 957 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=466048, ups=1.08, wpb=433212, bsz=16550.5, num_updates=56600, lr=0.000265841, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=54814 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1058 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=460432, ups=1.07, wpb=431834, bsz=16323.5, num_updates=56700, lr=0.000265606, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54907 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1158 / 1689 loss=4.143, nll_loss=2.522, ppl=5.74, wps=463556, ups=1.07, wpb=432186, bsz=16561.4, num_updates=56800, lr=0.000265372, gnorm=0.236, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.6, wall=55001 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1258 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=464200, ups=1.07, wpb=433377, bsz=16636.6, num_updates=56900, lr=0.000265139, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55094 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 epoch 034: 1358 / 1689 loss=4.149, nll_loss=2.53, ppl=5.77, wps=466118, ups=1.07, wpb=434599, bsz=16675.8, num_updates=57000, lr=0.000264906, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.5, wall=55187 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034 | valid on 'valid' subset | loss 4.225 | nll_loss 2.584 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.225 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1458 / 1689 loss=4.151, nll_loss=2.532, ppl=5.78, wps=342399, ups=0.79, wpb=434795, bsz=16398.2, num_updates=57100, lr=0.000264674, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=106, gb_free=19, wall=55314 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1558 / 1689 loss=4.15, nll_loss=2.53, ppl=5.77, wps=472510, ups=1.08, wpb=436649, bsz=16352.2, num_updates=57200, lr=0.000264443, gnorm=0.252, clip=0, loss_scale=1, train_wall=91, gb_free=18.5, wall=55407 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 epoch 034: 1658 / 1689 loss=4.153, nll_loss=2.534, ppl=5.79, wps=469505, ups=1.08, wpb=435236, bsz=16335.9, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=55499 end of epoch 34 (average epoch stats below) epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 epoch 034 | loss 4.142 | nll_loss 2.521 | ppl 5.74 | wps 449225 | ups 1.04 | wpb 433528 | bsz 16503 | num_updates 57331 | lr 0.000264141 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 20.2 | wall 55527 Start iterating over samples epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 69 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459997, ups=1.07, wpb=430449, bsz=16381, num_updates=57400, lr=0.000263982, gnorm=0.243, clip=0, loss_scale=1, train_wall=91, gb_free=20.4, wall=55593 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 169 / 1689 loss=4.133, nll_loss=2.51, ppl=5.7, wps=468983, ups=1.08, wpb=433197, bsz=16423.8, num_updates=57500, lr=0.000263752, gnorm=0.225, clip=0, loss_scale=1, train_wall=92, gb_free=18.2, wall=55685 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 270 / 1689 loss=4.126, nll_loss=2.502, ppl=5.66, wps=460083, ups=1.06, wpb=433674, bsz=16451, num_updates=57600, lr=0.000263523, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55780 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 370 / 1689 loss=4.129, nll_loss=2.506, ppl=5.68, wps=464538, ups=1.07, wpb=434630, bsz=16719.6, num_updates=57700, lr=0.000263295, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=55873 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 470 / 1689 loss=4.148, nll_loss=2.527, ppl=5.76, wps=463757, ups=1.07, wpb=434783, bsz=16400.8, num_updates=57800, lr=0.000263067, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=55967 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 570 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=459809, ups=1.06, wpb=434404, bsz=16296.6, num_updates=57900, lr=0.00026284, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=56061 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 epoch 035: 670 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=464522, ups=1.07, wpb=434694, bsz=16582.7, num_updates=58000, lr=0.000262613, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56155 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.238 | nll_loss 2.595 | ppl 6.04 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.225 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 771 / 1689 loss=4.144, nll_loss=2.523, ppl=5.75, wps=334579, ups=0.77, wpb=436001, bsz=16831.8, num_updates=58100, lr=0.000262387, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=97, gb_free=18.7, wall=56285 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 871 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=467669, ups=1.08, wpb=432165, bsz=16623.4, num_updates=58200, lr=0.000262161, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=56378 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 971 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=464182, ups=1.07, wpb=432470, bsz=16610.6, num_updates=58300, lr=0.000261936, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=56471 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1071 / 1689 loss=4.147, nll_loss=2.527, ppl=5.76, wps=468712, ups=1.08, wpb=435017, bsz=16207.5, num_updates=58400, lr=0.000261712, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=56564 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1171 / 1689 loss=4.149, nll_loss=2.528, ppl=5.77, wps=464288, ups=1.08, wpb=431717, bsz=16288.5, num_updates=58500, lr=0.000261488, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=56657 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1271 / 1689 loss=4.14, nll_loss=2.519, ppl=5.73, wps=460448, ups=1.07, wpb=431673, bsz=16225.8, num_updates=58600, lr=0.000261265, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=56750 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1371 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=462711, ups=1.07, wpb=433120, bsz=16467.7, num_updates=58700, lr=0.000261042, gnorm=0.236, clip=0, loss_scale=1, train_wall=92, gb_free=17.8, wall=56844 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1472 / 1689 loss=4.153, nll_loss=2.533, ppl=5.79, wps=457735, ups=1.05, wpb=435894, bsz=16629.7, num_updates=58800, lr=0.00026082, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.9, wall=56939 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1572 / 1689 loss=4.141, nll_loss=2.521, ppl=5.74, wps=463377, ups=1.07, wpb=433717, bsz=16834.9, num_updates=58900, lr=0.000260599, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=57033 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 epoch 035: 1672 / 1689 loss=4.15, nll_loss=2.53, ppl=5.78, wps=461802, ups=1.07, wpb=432624, bsz=16694.7, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=57127 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 epoch 035 | valid on 'valid' subset | loss 4.225 | nll_loss 2.588 | ppl 6.01 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.225 end of epoch 35 (average epoch stats below) epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 epoch 035 | loss 4.139 | nll_loss 2.517 | ppl 5.72 | wps 440972 | ups 1.02 | wpb 433542 | bsz 16506.5 | num_updates 59017 | lr 0.00026034 | gnorm 0.234 | clip 0 | loss_scale 0.5 | train_wall 1579 | gb_free 21.7 | wall 57185 Start iterating over samples epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 83 / 1689 loss=4.127, nll_loss=2.504, ppl=5.67, wps=314158, ups=0.73, wpb=431786, bsz=16626.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.4, wall=57264 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 183 / 1689 loss=4.122, nll_loss=2.498, ppl=5.65, wps=458207, ups=1.06, wpb=433454, bsz=16541.4, num_updates=59200, lr=0.000259938, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=57359 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 283 / 1689 loss=4.126, nll_loss=2.502, ppl=5.67, wps=464655, ups=1.07, wpb=432592, bsz=16302.2, num_updates=59300, lr=0.000259718, gnorm=0.226, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=57452 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 383 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=463212, ups=1.07, wpb=433030, bsz=16630, num_updates=59400, lr=0.0002595, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=57545 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 483 / 1689 loss=4.131, nll_loss=2.508, ppl=5.69, wps=463233, ups=1.07, wpb=432348, bsz=16621.7, num_updates=59500, lr=0.000259281, gnorm=0.235, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=57639 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 583 / 1689 loss=4.125, nll_loss=2.501, ppl=5.66, wps=458109, ups=1.06, wpb=431124, bsz=16478.2, num_updates=59600, lr=0.000259064, gnorm=0.238, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=57733 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 683 / 1689 loss=4.133, nll_loss=2.511, ppl=5.7, wps=463061, ups=1.07, wpb=431974, bsz=16224.2, num_updates=59700, lr=0.000258847, gnorm=0.229, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=57826 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 784 / 1689 loss=4.137, nll_loss=2.515, ppl=5.71, wps=459192, ups=1.06, wpb=434234, bsz=16114.2, num_updates=59800, lr=0.00025863, gnorm=0.233, clip=0, loss_scale=1, train_wall=93, gb_free=20.8, wall=57921 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 884 / 1689 loss=4.13, nll_loss=2.507, ppl=5.68, wps=464047, ups=1.07, wpb=434126, bsz=16527, num_updates=59900, lr=0.000258414, gnorm=0.245, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=58014 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 epoch 036: 984 / 1689 loss=4.134, nll_loss=2.512, ppl=5.7, wps=462652, ups=1.06, wpb=434701, bsz=16371.4, num_updates=60000, lr=0.000258199, gnorm=0.228, clip=0, loss_scale=1, train_wall=92, gb_free=18.1, wall=58108 Stopping training due to num_updates: 60000 >= max_update: 60000 begin validation on "valid" subset epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 epoch 036 | valid on 'valid' subset | loss 4.225 | nll_loss 2.586 | ppl 6 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.225 end of epoch 36 (average epoch stats below) epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 epoch 036 | loss 4.129 | nll_loss 2.506 | ppl 5.68 | wps 451506 | ups 1.04 | wpb 433260 | bsz 16457.2 | num_updates 60000 | lr 0.000258199 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 907 | gb_free 18.1 | wall 58128 done training in 58116.2 seconds